1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*****************************************************************************
4 *
5 * Copyright (C) 1999-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 *
8 ******************************************************************************/
9
10 /*
11 * uconv(1): an iconv(1)-like converter using ICU.
12 *
13 * Original code by Jonas Utterström <jonas.utterstrom@vittran.norrnod.se>
14 * contributed in 1999.
15 *
16 * Conversion to the C conversion API and many improvements by
17 * Yves Arrouye <yves@realnames.com>, current maintainer.
18 *
19 * Markus Scherer maintainer from 2003.
20 * See source code repository history for changes.
21 */
22
23 #include <unicode/utypes.h>
24 #include <unicode/putil.h>
25 #include <unicode/ucnv.h>
26 #include <unicode/uenum.h>
27 #include <unicode/unistr.h>
28 #include <unicode/translit.h>
29 #include <unicode/uset.h>
30 #include <unicode/uclean.h>
31 #include <unicode/utf16.h>
32
33 #include <stdio.h>
34 #include <errno.h>
35 #include <string.h>
36 #include <stdlib.h>
37
38 #include "cmemory.h"
39 #include "cstring.h"
40 #include "ustrfmt.h"
41
42 #include "unicode/uwmsg.h"
43
44 U_NAMESPACE_USE
45
/*
 * Windows-only support for switching a standard stream to binary mode.
 * When this section is active it defines USE_FILENO_BINARY_MODE, which the
 * conversion code tests before calling setmode(fileno(stdin), O_BINARY).
 */
#if U_PLATFORM_USES_ONLY_WIN32_API && !defined(__STRICT_ANSI__)
#include <io.h>
#include <fcntl.h>
/* NOTE(review): this inner test repeats the platform half of the outer #if. */
#if U_PLATFORM_USES_ONLY_WIN32_API
#define USE_FILENO_BINARY_MODE 1
/* Windows likes to rename Unix-like functions */
/* Each name is mapped to its underscore-prefixed form only if not already defined. */
#ifndef fileno
#define fileno _fileno
#endif
#ifndef setmode
#define setmode _setmode
#endif
#ifndef O_BINARY
#define O_BINARY _O_BINARY
#endif
#endif
#endif
63
/*
 * When UCONVMSG_LINK is defined, the message data is linked into the program
 * as the uconvmsg_dat symbol; initMsg() hands it to udata_setAppData().
 */
#ifdef UCONVMSG_LINK
/* below from the README */
#include "unicode/utypes.h"
#include "unicode/udata.h"
U_CFUNC char uconvmsg_dat[];
#endif

/* Initial value of the conversion buffer size variable in main(). */
#define DEFAULT_BUFSZ 4096
/* Name of the message bundle (also used as the application data name). */
#define UCONVMSG "uconvmsg"

/* Set by initMsg() from u_wmsg_setPath(); read by usage(). */
static UResourceBundle *gBundle = 0; /* Bundle containing messages. */
75
76 /*
77 * Initialize the message bundle so that message strings can be fetched
78 * by u_wmsg().
79 *
80 */
81
initMsg(const char * pname)82 static void initMsg(const char *pname) {
83 static int ps = 0;
84
85 if (!ps) {
86 char dataPath[2048]; /* XXX Sloppy: should be PATH_MAX. */
87 UErrorCode err = U_ZERO_ERROR;
88
89 ps = 1;
90
91 /* Set up our static data - if any */
92 #if defined(UCONVMSG_LINK) && U_PLATFORM != U_PF_OS390 /* On z/OS, this is failing. */
93 udata_setAppData(UCONVMSG, (const void*) uconvmsg_dat, &err);
94 if (U_FAILURE(err)) {
95 fprintf(stderr, "%s: warning, problem installing our static resource bundle data uconvmsg: %s - trying anyways.\n",
96 pname, u_errorName(err));
97 err = U_ZERO_ERROR; /* It may still fail */
98 }
99 #endif
100
101 /* Get messages. */
102 gBundle = u_wmsg_setPath(UCONVMSG, &err);
103 if (U_FAILURE(err)) {
104 fprintf(stderr,
105 "%s: warning: couldn't open bundle %s: %s\n",
106 pname, UCONVMSG, u_errorName(err));
107 #ifdef UCONVMSG_LINK
108 fprintf(stderr,
109 "%s: setAppData was called, internal data %s failed to load\n",
110 pname, UCONVMSG);
111 #endif
112
113 err = U_ZERO_ERROR;
114 /* that was try #1, try again with a path */
115 uprv_strcpy(dataPath, u_getDataDirectory());
116 uprv_strcat(dataPath, U_FILE_SEP_STRING);
117 uprv_strcat(dataPath, UCONVMSG);
118
119 gBundle = u_wmsg_setPath(dataPath, &err);
120 if (U_FAILURE(err)) {
121 fprintf(stderr,
122 "%s: warning: still couldn't open bundle %s: %s\n",
123 pname, dataPath, u_errorName(err));
124 fprintf(stderr, "%s: warning: messages will not be displayed\n", pname);
125 }
126 }
127 }
128 }
129
130 /* Mapping of callback names to the callbacks passed to the converter
131 API. */
132
133 static struct callback_ent {
134 const char *name;
135 UConverterFromUCallback fromu;
136 const void *fromuctxt;
137 UConverterToUCallback tou;
138 const void *touctxt;
139 } transcode_callbacks[] = {
140 { "substitute",
141 UCNV_FROM_U_CALLBACK_SUBSTITUTE, 0,
142 UCNV_TO_U_CALLBACK_SUBSTITUTE, 0 },
143 { "skip",
144 UCNV_FROM_U_CALLBACK_SKIP, 0,
145 UCNV_TO_U_CALLBACK_SKIP, 0 },
146 { "stop",
147 UCNV_FROM_U_CALLBACK_STOP, 0,
148 UCNV_TO_U_CALLBACK_STOP, 0 },
149 { "escape",
150 UCNV_FROM_U_CALLBACK_ESCAPE, 0,
151 UCNV_TO_U_CALLBACK_ESCAPE, 0},
152 { "escape-icu",
153 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_ICU,
154 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_ICU },
155 { "escape-java",
156 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_JAVA,
157 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_JAVA },
158 { "escape-c",
159 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_C,
160 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_C },
161 { "escape-xml",
162 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX,
163 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX },
164 { "escape-xml-hex",
165 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX,
166 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX },
167 { "escape-xml-dec",
168 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC,
169 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC },
170 { "escape-unicode", UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_UNICODE,
171 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_UNICODE }
172 };
173
174 /* Return a pointer to a callback record given its name. */
175
findCallback(const char * name)176 static const struct callback_ent *findCallback(const char *name) {
177 int i, count =
178 UPRV_LENGTHOF(transcode_callbacks);
179
180 /* We'll do a linear search, there aren't many of them and bsearch()
181 may not be that portable. */
182
183 for (i = 0; i < count; ++i) {
184 if (!uprv_stricmp(name, transcode_callbacks[i].name)) {
185 return &transcode_callbacks[i];
186 }
187 }
188
189 return 0;
190 }
191
/* Print converter information. If lookfor is set, only that converter will
   be printed, otherwise all converters will be printed. If canon is
   non-zero, tags and aliases for each converter are printed too, in the
   format expected for convrtrs.txt(5). */
196
printConverters(const char * pname,const char * lookfor,UBool canon)197 static int printConverters(const char *pname, const char *lookfor,
198 UBool canon)
199 {
200 UErrorCode err = U_ZERO_ERROR;
201 int32_t num;
202 uint16_t num_stds;
203 const char **stds;
204
205 /* If there is a specified name, just handle that now. */
206
207 if (lookfor) {
208 if (!canon) {
209 printf("%s\n", lookfor);
210 return 0;
211 } else {
212 /* Because we are printing a canonical name, we need the
213 true converter name. We've done that already except for
214 the default name (because we want to print the exact
215 name one would get when calling ucnv_getDefaultName()
216 in non-canon mode). But since we do not know at this
217 point if we have the default name or something else, we
218 need to normalize again to the canonical converter
219 name. */
220
221 const char *truename = ucnv_getAlias(lookfor, 0, &err);
222 if (U_SUCCESS(err)) {
223 lookfor = truename;
224 } else {
225 err = U_ZERO_ERROR;
226 }
227 }
228 }
229
230 /* Print converter names. We come here for one of two reasons: we
231 are printing all the names (lookfor was null), or we have a
232 single converter to print but in canon mode, hence we need to
233 get to it in order to print everything. */
234
235 num = ucnv_countAvailable();
236 if (num <= 0) {
237 initMsg(pname);
238 u_wmsg(stderr, "cantGetNames");
239 return -1;
240 }
241 if (lookfor) {
242 num = 1; /* We know where we want to be. */
243 }
244
245 num_stds = ucnv_countStandards();
246 stds = (const char **) uprv_malloc(num_stds * sizeof(*stds));
247 if (!stds) {
248 u_wmsg(stderr, "cantGetTag", u_wmsg_errorName(U_MEMORY_ALLOCATION_ERROR));
249 return -1;
250 } else {
251 uint16_t s;
252
253 if (canon) {
254 printf("{ ");
255 }
256 for (s = 0; s < num_stds; ++s) {
257 stds[s] = ucnv_getStandard(s, &err);
258 if (canon) {
259 printf("%s ", stds[s]);
260 }
261 if (U_FAILURE(err)) {
262 u_wmsg(stderr, "cantGetTag", u_wmsg_errorName(err));
263 goto error_cleanup;
264 }
265 }
266 if (canon) {
267 puts("}");
268 }
269 }
270
271 for (int32_t i = 0; i < num; i++) {
272 const char *name;
273 uint16_t num_aliases;
274
275 /* Set the name either to what we are looking for, or
276 to the current converter name. */
277
278 if (lookfor) {
279 name = lookfor;
280 } else {
281 name = ucnv_getAvailableName(i);
282 }
283
284 /* Get all the aliases associated to the name. */
285
286 err = U_ZERO_ERROR;
287 num_aliases = ucnv_countAliases(name, &err);
288 if (U_FAILURE(err)) {
289 printf("%s", name);
290
291 UnicodeString str(name, "");
292 putchar('\t');
293 u_wmsg(stderr, "cantGetAliases", str.getTerminatedBuffer(),
294 u_wmsg_errorName(err));
295 goto error_cleanup;
296 } else {
297 uint16_t a, s, t;
298
299 /* Write all the aliases and their tags. */
300
301 for (a = 0; a < num_aliases; ++a) {
302 const char *alias = ucnv_getAlias(name, a, &err);
303
304 if (U_FAILURE(err)) {
305 UnicodeString str(name, "");
306 putchar('\t');
307 u_wmsg(stderr, "cantGetAliases", str.getTerminatedBuffer(),
308 u_wmsg_errorName(err));
309 goto error_cleanup;
310 }
311
312 /* Print the current alias so that it looks right. */
313 printf("%s%s%s", (canon ? (a == 0? "" : "\t" ) : "") ,
314 alias,
315 (canon ? "" : " "));
316
317 /* Look (slowly, linear searching) for a tag. */
318
319 if (canon) {
320 /* -1 to skip the last standard */
321 for (s = t = 0; s < num_stds-1; ++s) {
322 UEnumeration *nameEnum = ucnv_openStandardNames(name, stds[s], &err);
323 if (U_SUCCESS(err)) {
324 /* List the standard tags */
325 const char *standardName;
326 UBool isFirst = TRUE;
327 UErrorCode enumError = U_ZERO_ERROR;
328 while ((standardName = uenum_next(nameEnum, NULL, &enumError))) {
329 /* See if this alias is supported by this standard. */
330 if (!strcmp(standardName, alias)) {
331 if (!t) {
332 printf(" {");
333 t = 1;
334 }
335 /* Print a * after the default standard name */
336 printf(" %s%s", stds[s], (isFirst ? "*" : ""));
337 }
338 isFirst = FALSE;
339 }
340 }
341 }
342 if (t) {
343 printf(" }");
344 }
345 }
346 /* Terminate this entry. */
347 if (canon) {
348 puts("");
349 }
350
351 /* Move on. */
352 }
353 /* Terminate this entry. */
354 if (!canon) {
355 puts("");
356 }
357 }
358 }
359
360 /* Free temporary data. */
361
362 uprv_free(stds);
363
364 /* Success. */
365
366 return 0;
367 error_cleanup:
368 uprv_free(stds);
369 return -1;
370 }
371
372 /* Print all available transliterators. If canon is non zero, print
373 one transliterator per line. */
374
printTransliterators(UBool canon)375 static int printTransliterators(UBool canon)
376 {
377 #if UCONFIG_NO_TRANSLITERATION
378 printf("no transliterators available because of UCONFIG_NO_TRANSLITERATION, see uconfig.h\n");
379 return 1;
380 #else
381 UErrorCode status = U_ZERO_ERROR;
382 UEnumeration *ids = utrans_openIDs(&status);
383 int32_t i, numtrans = uenum_count(ids, &status);
384
385 char sepchar = canon ? '\n' : ' ';
386
387 for (i = 0; U_SUCCESS(status)&& (i < numtrans); ++i) {
388 int32_t len;
389 const char *nextTrans = uenum_next(ids, &len, &status);
390
391 printf("%s", nextTrans);
392 if (i < numtrans - 1) {
393 putchar(sepchar);
394 }
395 }
396
397 uenum_close(ids);
398
399 /* Add a terminating newline if needed. */
400
401 if (sepchar != '\n') {
402 putchar('\n');
403 }
404
405 /* Success. */
406
407 return 0;
408 #endif
409 }
410
// Code points that get special treatment during conversion.
enum {
    uLF  = 0x000a,  // line feed
    uCR  = 0x000d,  // carriage return
    uSP  = 0x0020,  // space
    uNL  = 0x0085,  // newline
    uLS  = 0x2028,  // line separator
    uPS  = 0x2029,  // paragraph separator
    uSig = 0xfeff   // signature/BOM character
};
420
421 static inline int32_t
getChunkLimit(const UnicodeString & prev,const UnicodeString & s)422 getChunkLimit(const UnicodeString &prev, const UnicodeString &s) {
423 // find one of
424 // CR, LF, CRLF, NL, LS, PS
425 // for paragraph ends (see UAX #13/Unicode 4)
426 // and include it in the chunk
427 // all of these characters are on the BMP
428 // do not include FF or VT in case they are part of a paragraph
429 // (important for bidi contexts)
430 static const UChar paraEnds[] = {
431 0xd, 0xa, 0x85, 0x2028, 0x2029
432 };
433 enum {
434 iCR, iLF, iNL, iLS, iPS, iCount
435 };
436
437 // first, see if there is a CRLF split between prev and s
438 if (prev.endsWith(paraEnds + iCR, 1)) {
439 if (s.startsWith(paraEnds + iLF, 1)) {
440 return 1; // split CRLF, include the LF
441 } else if (!s.isEmpty()) {
442 return 0; // complete the last chunk
443 } else {
444 return -1; // wait for actual further contents to arrive
445 }
446 }
447
448 const UChar *u = s.getBuffer(), *limit = u + s.length();
449 UChar c;
450
451 while (u < limit) {
452 c = *u++;
453 if (
454 ((c < uSP) && (c == uCR || c == uLF)) ||
455 (c == uNL) ||
456 ((c & uLS) == uLS)
457 ) {
458 if (c == uCR) {
459 // check for CRLF
460 if (u == limit) {
461 return -1; // LF may be in the next chunk
462 } else if (*u == uLF) {
463 ++u; // include the LF in this chunk
464 }
465 }
466 return (int32_t)(u - s.getBuffer());
467 }
468 }
469
470 return -1; // continue collecting the chunk
471 }
472
// How a converter deals with the U+FEFF Unicode signature character (BOM);
// these are the possible results of cnvSigType().
enum {
    CNV_NO_FEFF = 0,    // U+FEFF cannot be converted
    CNV_WITH_FEFF = 1,  // U+FEFF can be converted
    CNV_ADDS_FEFF = 2   // the converter adds/detects U+FEFF by itself
};
478
479 static inline UChar
nibbleToHex(uint8_t n)480 nibbleToHex(uint8_t n) {
481 n &= 0xf;
482 return
483 n <= 9 ?
484 (UChar)(0x30 + n) :
485 (UChar)((0x61 - 10) + n);
486 }
487
488 // check the converter's Unicode signature properties;
489 // the fromUnicode side of the converter must be in its initial state
490 // and will be reset again if it was used
491 static int32_t
cnvSigType(UConverter * cnv)492 cnvSigType(UConverter *cnv) {
493 UErrorCode err;
494 int32_t result;
495
496 // test if the output charset can convert U+FEFF
497 USet *set = uset_open(1, 0);
498 err = U_ZERO_ERROR;
499 ucnv_getUnicodeSet(cnv, set, UCNV_ROUNDTRIP_SET, &err);
500 if (U_SUCCESS(err) && uset_contains(set, uSig)) {
501 result = CNV_WITH_FEFF;
502 } else {
503 result = CNV_NO_FEFF; // an error occurred or U+FEFF cannot be converted
504 }
505 uset_close(set);
506
507 if (result == CNV_WITH_FEFF) {
508 // test if the output charset emits a signature anyway
509 const UChar a[1] = { 0x61 }; // "a"
510 const UChar *in;
511
512 char buffer[20];
513 char *out;
514
515 in = a;
516 out = buffer;
517 err = U_ZERO_ERROR;
518 ucnv_fromUnicode(cnv,
519 &out, buffer + sizeof(buffer),
520 &in, a + 1,
521 NULL, TRUE, &err);
522 ucnv_resetFromUnicode(cnv);
523
524 if (NULL != ucnv_detectUnicodeSignature(buffer, (int32_t)(out - buffer), NULL, &err) &&
525 U_SUCCESS(err)
526 ) {
527 result = CNV_ADDS_FEFF;
528 }
529 }
530
531 return result;
532 }
533
534 class ConvertFile {
535 public:
ConvertFile()536 ConvertFile() :
537 buf(NULL), outbuf(NULL), fromoffsets(NULL),
538 bufsz(0), signature(0) {}
539
540 void
setBufferSize(size_t bufferSize)541 setBufferSize(size_t bufferSize) {
542 bufsz = bufferSize;
543
544 buf = new char[2 * bufsz];
545 outbuf = buf + bufsz;
546
547 // +1 for an added U+FEFF in the intermediate Unicode buffer
548 fromoffsets = new int32_t[bufsz + 1];
549 }
550
~ConvertFile()551 ~ConvertFile() {
552 delete [] buf;
553 delete [] fromoffsets;
554 }
555
556 UBool convertFile(const char *pname,
557 const char *fromcpage,
558 UConverterToUCallback toucallback,
559 const void *touctxt,
560 const char *tocpage,
561 UConverterFromUCallback fromucallback,
562 const void *fromuctxt,
563 UBool fallback,
564 const char *translit,
565 const char *infilestr,
566 FILE * outfile, int verbose);
567 private:
568 friend int main(int argc, char **argv);
569
570 char *buf, *outbuf;
571 int32_t *fromoffsets;
572
573 size_t bufsz;
574 int8_t signature; // add (1) or remove (-1) a U+FEFF Unicode signature character
575 };
576
577 // Convert a file from one encoding to another
578 UBool
convertFile(const char * pname,const char * fromcpage,UConverterToUCallback toucallback,const void * touctxt,const char * tocpage,UConverterFromUCallback fromucallback,const void * fromuctxt,UBool fallback,const char * translit,const char * infilestr,FILE * outfile,int verbose)579 ConvertFile::convertFile(const char *pname,
580 const char *fromcpage,
581 UConverterToUCallback toucallback,
582 const void *touctxt,
583 const char *tocpage,
584 UConverterFromUCallback fromucallback,
585 const void *fromuctxt,
586 UBool fallback,
587 const char *translit,
588 const char *infilestr,
589 FILE * outfile, int verbose)
590 {
591 FILE *infile;
592 UBool ret = TRUE;
593 UConverter *convfrom = 0;
594 UConverter *convto = 0;
595 UErrorCode err = U_ZERO_ERROR;
596 UBool flush;
597 UBool closeFile = FALSE;
598 const char *cbufp, *prevbufp;
599 char *bufp;
600
601 uint32_t infoffset = 0, outfoffset = 0; /* Where we are in the file, for error reporting. */
602
603 const UChar *unibuf, *unibufbp;
604 UChar *unibufp;
605
606 size_t rd, wr;
607
608 #if !UCONFIG_NO_TRANSLITERATION
609 Transliterator *t = 0; // Transliterator acting on Unicode data.
610 UnicodeString chunk; // One chunk of the text being collected for transformation.
611 #endif
612 UnicodeString u; // String to do the transliteration.
613 int32_t ulen;
614
615 // use conversion offsets for error messages
616 // unless a transliterator is used -
617 // a text transformation will reorder characters in unpredictable ways
618 UBool useOffsets = TRUE;
619
620 // Open the correct input file or connect to stdin for reading input
621
622 if (infilestr != 0 && strcmp(infilestr, "-")) {
623 infile = fopen(infilestr, "rb");
624 if (infile == 0) {
625 UnicodeString str1(infilestr, "");
626 str1.append((UChar32) 0);
627 UnicodeString str2(strerror(errno), "");
628 str2.append((UChar32) 0);
629 initMsg(pname);
630 u_wmsg(stderr, "cantOpenInputF", str1.getBuffer(), str2.getBuffer());
631 return FALSE;
632 }
633 closeFile = TRUE;
634 } else {
635 infilestr = "-";
636 infile = stdin;
637 #ifdef USE_FILENO_BINARY_MODE
638 if (setmode(fileno(stdin), O_BINARY) == -1) {
639 initMsg(pname);
640 u_wmsg(stderr, "cantSetInBinMode");
641 return FALSE;
642 }
643 #endif
644 }
645
646 if (verbose) {
647 fprintf(stderr, "%s:\n", infilestr);
648 }
649
650 #if !UCONFIG_NO_TRANSLITERATION
651 // Create transliterator as needed.
652
653 if (translit != NULL && *translit) {
654 UParseError parse;
655 UnicodeString str(translit), pestr;
656
657 /* Create from rules or by ID as needed. */
658
659 parse.line = -1;
660
661 if (uprv_strchr(translit, ':') || uprv_strchr(translit, '>') || uprv_strchr(translit, '<') || uprv_strchr(translit, '>')) {
662 t = Transliterator::createFromRules(UNICODE_STRING_SIMPLE("Uconv"), str, UTRANS_FORWARD, parse, err);
663 } else {
664 t = Transliterator::createInstance(UnicodeString(translit, -1, US_INV), UTRANS_FORWARD, err);
665 }
666
667 if (U_FAILURE(err)) {
668 str.append((UChar32) 0);
669 initMsg(pname);
670
671 if (parse.line >= 0) {
672 UChar linebuf[20], offsetbuf[20];
673 uprv_itou(linebuf, 20, parse.line, 10, 0);
674 uprv_itou(offsetbuf, 20, parse.offset, 10, 0);
675 u_wmsg(stderr, "cantCreateTranslitParseErr", str.getTerminatedBuffer(),
676 u_wmsg_errorName(err), linebuf, offsetbuf);
677 } else {
678 u_wmsg(stderr, "cantCreateTranslit", str.getTerminatedBuffer(),
679 u_wmsg_errorName(err));
680 }
681
682 if (t) {
683 delete t;
684 t = 0;
685 }
686 goto error_exit;
687 }
688
689 useOffsets = FALSE;
690 }
691 #endif
692
693 // Create codepage converter. If the codepage or its aliases weren't
694 // available, it returns NULL and a failure code. We also set the
695 // callbacks, and return errors in the same way.
696
697 convfrom = ucnv_open(fromcpage, &err);
698 if (U_FAILURE(err)) {
699 UnicodeString str(fromcpage, "");
700 initMsg(pname);
701 u_wmsg(stderr, "cantOpenFromCodeset", str.getTerminatedBuffer(),
702 u_wmsg_errorName(err));
703 goto error_exit;
704 }
705 ucnv_setToUCallBack(convfrom, toucallback, touctxt, 0, 0, &err);
706 if (U_FAILURE(err)) {
707 initMsg(pname);
708 u_wmsg(stderr, "cantSetCallback", u_wmsg_errorName(err));
709 goto error_exit;
710 }
711
712 convto = ucnv_open(tocpage, &err);
713 if (U_FAILURE(err)) {
714 UnicodeString str(tocpage, "");
715 initMsg(pname);
716 u_wmsg(stderr, "cantOpenToCodeset", str.getTerminatedBuffer(),
717 u_wmsg_errorName(err));
718 goto error_exit;
719 }
720 ucnv_setFromUCallBack(convto, fromucallback, fromuctxt, 0, 0, &err);
721 if (U_FAILURE(err)) {
722 initMsg(pname);
723 u_wmsg(stderr, "cantSetCallback", u_wmsg_errorName(err));
724 goto error_exit;
725 }
726 ucnv_setFallback(convto, fallback);
727
728 UBool willexit, fromSawEndOfBytes, toSawEndOfUnicode;
729 int8_t sig;
730
731 // OK, we can convert now.
732 sig = signature;
733 rd = 0;
734
735 do {
736 willexit = FALSE;
737
738 // input file offset at the beginning of the next buffer
739 infoffset += rd;
740
741 rd = fread(buf, 1, bufsz, infile);
742 if (ferror(infile) != 0) {
743 UnicodeString str(strerror(errno));
744 initMsg(pname);
745 u_wmsg(stderr, "cantRead", str.getTerminatedBuffer());
746 goto error_exit;
747 }
748
749 // Convert the read buffer into the new encoding via Unicode.
750 // After the call 'unibufp' will be placed behind the last
751 // character that was converted in the 'unibuf'.
752 // Also the 'cbufp' is positioned behind the last converted
753 // character.
754 // At the last conversion in the file, flush should be set to
755 // true so that we get all characters converted.
756 //
757 // The converter must be flushed at the end of conversion so
758 // that characters on hold also will be written.
759
760 cbufp = buf;
761 flush = (UBool)(rd != bufsz);
762
763 // convert until the input is consumed
764 do {
765 // remember the start of the current byte-to-Unicode conversion
766 prevbufp = cbufp;
767
768 unibuf = unibufp = u.getBuffer((int32_t)bufsz);
769
770 // Use bufsz instead of u.getCapacity() for the targetLimit
771 // so that we don't overflow fromoffsets[].
772 ucnv_toUnicode(convfrom, &unibufp, unibuf + bufsz, &cbufp,
773 buf + rd, useOffsets ? fromoffsets : NULL, flush, &err);
774
775 ulen = (int32_t)(unibufp - unibuf);
776 u.releaseBuffer(U_SUCCESS(err) ? ulen : 0);
777
778 // fromSawEndOfBytes indicates that ucnv_toUnicode() is done
779 // converting all of the input bytes.
780 // It works like this because ucnv_toUnicode() returns only under the
781 // following conditions:
782 // - an error occurred during conversion (an error code is set)
783 // - the target buffer is filled (the error code indicates an overflow)
784 // - the source is consumed
785 // That is, if the error code does not indicate a failure,
786 // not even an overflow, then the source must be consumed entirely.
787 fromSawEndOfBytes = (UBool)U_SUCCESS(err);
788
789 if (err == U_BUFFER_OVERFLOW_ERROR) {
790 err = U_ZERO_ERROR;
791 } else if (U_FAILURE(err)) {
792 char pos[32], errorBytes[32];
793 int8_t i, length, errorLength;
794
795 UErrorCode localError = U_ZERO_ERROR;
796 errorLength = (int8_t)sizeof(errorBytes);
797 ucnv_getInvalidChars(convfrom, errorBytes, &errorLength, &localError);
798 if (U_FAILURE(localError) || errorLength == 0) {
799 errorLength = 1;
800 }
801
802 // print the input file offset of the start of the error bytes:
803 // input file offset of the current byte buffer +
804 // length of the just consumed bytes -
805 // length of the error bytes
806 length =
807 (int8_t)sprintf(pos, "%d",
808 (int)(infoffset + (cbufp - buf) - errorLength));
809
810 // output the bytes that caused the error
811 UnicodeString str;
812 for (i = 0; i < errorLength; ++i) {
813 if (i > 0) {
814 str.append((UChar)uSP);
815 }
816 str.append(nibbleToHex((uint8_t)errorBytes[i] >> 4));
817 str.append(nibbleToHex((uint8_t)errorBytes[i]));
818 }
819
820 initMsg(pname);
821 u_wmsg(stderr, "problemCvtToU",
822 UnicodeString(pos, length, "").getTerminatedBuffer(),
823 str.getTerminatedBuffer(),
824 u_wmsg_errorName(err));
825
826 willexit = TRUE;
827 err = U_ZERO_ERROR; /* reset the error for the rest of the conversion. */
828 }
829
830 // Replaced a check for whether the input was consumed by
831 // looping until it is; message key "premEndInput" now obsolete.
832
833 if (ulen == 0) {
834 continue;
835 }
836
837 // remove a U+FEFF Unicode signature character if requested
838 if (sig < 0) {
839 if (u.charAt(0) == uSig) {
840 u.remove(0, 1);
841
842 // account for the removed UChar and offset
843 --ulen;
844
845 if (useOffsets) {
846 // remove an offset from fromoffsets[] as well
847 // to keep the array parallel with the UChars
848 memmove(fromoffsets, fromoffsets + 1, ulen * 4);
849 }
850
851 }
852 sig = 0;
853 }
854
855 #if !UCONFIG_NO_TRANSLITERATION
856 // Transliterate/transform if needed.
857
858 // For transformation, we use chunking code -
859 // collect Unicode input until, for example, an end-of-line,
860 // then transform and output-convert that and continue collecting.
861 // This makes the transformation result independent of the buffer size
862 // while avoiding the slower keyboard mode.
863 // The end-of-chunk characters are completely included in the
864 // transformed string in case they are to be transformed themselves.
865 if (t != NULL) {
866 UnicodeString out;
867 int32_t chunkLimit;
868
869 do {
870 chunkLimit = getChunkLimit(chunk, u);
871 if (chunkLimit < 0 && flush && fromSawEndOfBytes) {
872 // use all of the rest at the end of the text
873 chunkLimit = u.length();
874 }
875 if (chunkLimit >= 0) {
876 // complete the chunk and transform it
877 chunk.append(u, 0, chunkLimit);
878 u.remove(0, chunkLimit);
879 t->transliterate(chunk);
880
881 // append the transformation result to the result and empty the chunk
882 out.append(chunk);
883 chunk.remove();
884 } else {
885 // continue collecting the chunk
886 chunk.append(u);
887 break;
888 }
889 } while (!u.isEmpty());
890
891 u = out;
892 ulen = u.length();
893 }
894 #endif
895
896 // add a U+FEFF Unicode signature character if requested
897 // and possible/necessary
898 if (sig > 0) {
899 if (u.charAt(0) != uSig && cnvSigType(convto) == CNV_WITH_FEFF) {
900 u.insert(0, (UChar)uSig);
901
902 if (useOffsets) {
903 // insert a pseudo-offset into fromoffsets[] as well
904 // to keep the array parallel with the UChars
905 memmove(fromoffsets + 1, fromoffsets, ulen * 4);
906 fromoffsets[0] = -1;
907 }
908
909 // account for the additional UChar and offset
910 ++ulen;
911 }
912 sig = 0;
913 }
914
915 // Convert the Unicode buffer into the destination codepage
916 // Again 'bufp' will be placed behind the last converted character
917 // And 'unibufp' will be placed behind the last converted unicode character
918 // At the last conversion flush should be set to true to ensure that
919 // all characters left get converted
920
921 unibuf = unibufbp = u.getBuffer();
922
923 do {
924 bufp = outbuf;
925
926 // Use fromSawEndOfBytes in addition to the flush flag -
927 // it indicates whether the intermediate Unicode string
928 // contains the very last UChars for the very last input bytes.
929 ucnv_fromUnicode(convto, &bufp, outbuf + bufsz,
930 &unibufbp,
931 unibuf + ulen,
932 NULL, (UBool)(flush && fromSawEndOfBytes), &err);
933
934 // toSawEndOfUnicode indicates that ucnv_fromUnicode() is done
935 // converting all of the intermediate UChars.
936 // See comment for fromSawEndOfBytes.
937 toSawEndOfUnicode = (UBool)U_SUCCESS(err);
938
939 if (err == U_BUFFER_OVERFLOW_ERROR) {
940 err = U_ZERO_ERROR;
941 } else if (U_FAILURE(err)) {
942 UChar errorUChars[4];
943 const char *errtag;
944 char pos[32];
945 UChar32 c;
946 int8_t i, length, errorLength;
947
948 UErrorCode localError = U_ZERO_ERROR;
949 errorLength = UPRV_LENGTHOF(errorUChars);
950 ucnv_getInvalidUChars(convto, errorUChars, &errorLength, &localError);
951 if (U_FAILURE(localError) || errorLength == 0) {
952 // need at least 1 so that we don't access beyond the length of fromoffsets[]
953 errorLength = 1;
954 }
955
956 int32_t ferroffset;
957
958 if (useOffsets) {
959 // Unicode buffer offset of the start of the error UChars
960 ferroffset = (int32_t)((unibufbp - unibuf) - errorLength);
961 if (ferroffset < 0) {
962 // approximation - the character started in the previous Unicode buffer
963 ferroffset = 0;
964 }
965
966 // get the corresponding byte offset out of fromoffsets[]
967 // go back if the offset is not known for some of the UChars
968 int32_t fromoffset;
969 do {
970 fromoffset = fromoffsets[ferroffset];
971 } while (fromoffset < 0 && --ferroffset >= 0);
972
973 // total input file offset =
974 // input file offset of the current byte buffer +
975 // byte buffer offset of where the current Unicode buffer is converted from +
976 // fromoffsets[Unicode offset]
977 ferroffset = infoffset + (prevbufp - buf) + fromoffset;
978 errtag = "problemCvtFromU";
979 } else {
980 // Do not use fromoffsets if (t != NULL) because the Unicode text may
981 // be different from what the offsets refer to.
982
983 // output file offset
984 ferroffset = (int32_t)(outfoffset + (bufp - outbuf));
985 errtag = "problemCvtFromUOut";
986 }
987
988 length = (int8_t)sprintf(pos, "%u", (int)ferroffset);
989
990 // output the code points that caused the error
991 UnicodeString str;
992 for (i = 0; i < errorLength;) {
993 if (i > 0) {
994 str.append((UChar)uSP);
995 }
996 U16_NEXT(errorUChars, i, errorLength, c);
997 if (c >= 0x100000) {
998 str.append(nibbleToHex((uint8_t)(c >> 20)));
999 }
1000 if (c >= 0x10000) {
1001 str.append(nibbleToHex((uint8_t)(c >> 16)));
1002 }
1003 str.append(nibbleToHex((uint8_t)(c >> 12)));
1004 str.append(nibbleToHex((uint8_t)(c >> 8)));
1005 str.append(nibbleToHex((uint8_t)(c >> 4)));
1006 str.append(nibbleToHex((uint8_t)c));
1007 }
1008
1009 initMsg(pname);
1010 u_wmsg(stderr, errtag,
1011 UnicodeString(pos, length, "").getTerminatedBuffer(),
1012 str.getTerminatedBuffer(),
1013 u_wmsg_errorName(err));
1014 u_wmsg(stderr, "errorUnicode", str.getTerminatedBuffer());
1015
1016 willexit = TRUE;
1017 err = U_ZERO_ERROR; /* reset the error for the rest of the conversion. */
1018 }
1019
1020 // Replaced a check for whether the intermediate Unicode characters were all consumed by
1021 // looping until they are; message key "premEnd" now obsolete.
1022
1023 // Finally, write the converted buffer to the output file
1024 size_t outlen = (size_t) (bufp - outbuf);
1025 outfoffset += (int32_t)(wr = fwrite(outbuf, 1, outlen, outfile));
1026 if (wr != outlen) {
1027 UnicodeString str(strerror(errno));
1028 initMsg(pname);
1029 u_wmsg(stderr, "cantWrite", str.getTerminatedBuffer());
1030 willexit = TRUE;
1031 }
1032
1033 if (willexit) {
1034 goto error_exit;
1035 }
1036 } while (!toSawEndOfUnicode);
1037 } while (!fromSawEndOfBytes);
1038 } while (!flush); // Stop when we have flushed the
1039 // converters (this means that it's
1040 // the end of output)
1041
1042 goto normal_exit;
1043
1044 error_exit:
1045 ret = FALSE;
1046
1047 normal_exit:
1048 // Cleanup.
1049
1050 ucnv_close(convfrom);
1051 ucnv_close(convto);
1052
1053 #if !UCONFIG_NO_TRANSLITERATION
1054 delete t;
1055 #endif
1056
1057 if (closeFile) {
1058 fclose(infile);
1059 }
1060
1061 return ret;
1062 }
1063
usage(const char * pname,int ecode)1064 static void usage(const char *pname, int ecode) {
1065 const UChar *msg;
1066 int32_t msgLen;
1067 UErrorCode err = U_ZERO_ERROR;
1068 FILE *fp = ecode ? stderr : stdout;
1069 int res;
1070
1071 initMsg(pname);
1072 msg =
1073 ures_getStringByKey(gBundle, ecode ? "lcUsageWord" : "ucUsageWord",
1074 &msgLen, &err);
1075 UnicodeString upname(pname, (int32_t)(uprv_strlen(pname) + 1));
1076 UnicodeString mname(msg, msgLen + 1);
1077
1078 res = u_wmsg(fp, "usage", mname.getBuffer(), upname.getBuffer());
1079 if (!ecode) {
1080 if (!res) {
1081 fputc('\n', fp);
1082 }
1083 if (!u_wmsg(fp, "help")) {
1084 /* Now dump callbacks and finish. */
1085
1086 int i, count =
1087 UPRV_LENGTHOF(transcode_callbacks);
1088 for (i = 0; i < count; ++i) {
1089 fprintf(fp, " %s", transcode_callbacks[i].name);
1090 }
1091 fputc('\n', fp);
1092 }
1093 }
1094
1095 exit(ecode);
1096 }
1097
1098 extern int
main(int argc,char ** argv)1099 main(int argc, char **argv)
1100 {
1101 FILE *outfile;
1102 int ret = 0;
1103
1104 size_t bufsz = DEFAULT_BUFSZ;
1105
1106 const char *fromcpage = 0;
1107 const char *tocpage = 0;
1108 const char *translit = 0;
1109 const char *outfilestr = 0;
1110 UBool fallback = FALSE;
1111
1112 UConverterFromUCallback fromucallback = UCNV_FROM_U_CALLBACK_STOP;
1113 const void *fromuctxt = 0;
1114 UConverterToUCallback toucallback = UCNV_TO_U_CALLBACK_STOP;
1115 const void *touctxt = 0;
1116
1117 char **iter, **remainArgv, **remainArgvLimit;
1118 char **end = argv + argc;
1119
1120 const char *pname;
1121
1122 UBool printConvs = FALSE, printCanon = FALSE, printTranslits = FALSE;
1123 const char *printName = 0;
1124
1125 UBool verbose = FALSE;
1126 UErrorCode status = U_ZERO_ERROR;
1127
1128 ConvertFile cf;
1129
1130 /* Initialize ICU */
1131 u_init(&status);
1132 if (U_FAILURE(status)) {
1133 fprintf(stderr, "%s: can not initialize ICU. status = %s\n",
1134 argv[0], u_errorName(status));
1135 exit(1);
1136 }
1137
1138 // Get and prettify pname.
1139 pname = uprv_strrchr(*argv, U_FILE_SEP_CHAR);
1140 #if U_PLATFORM_USES_ONLY_WIN32_API
1141 if (!pname) {
1142 pname = uprv_strrchr(*argv, '/');
1143 }
1144 #endif
1145 if (!pname) {
1146 pname = *argv;
1147 } else {
1148 ++pname;
1149 }
1150
1151 // First, get the arguments from command-line
1152 // to know the codepages to convert between
1153
1154 remainArgv = remainArgvLimit = argv + 1;
1155 for (iter = argv + 1; iter != end; iter++) {
1156 // Check for from charset
1157 if (strcmp("-f", *iter) == 0 || !strcmp("--from-code", *iter)) {
1158 iter++;
1159 if (iter != end)
1160 fromcpage = *iter;
1161 else
1162 usage(pname, 1);
1163 } else if (strcmp("-t", *iter) == 0 || !strcmp("--to-code", *iter)) {
1164 iter++;
1165 if (iter != end)
1166 tocpage = *iter;
1167 else
1168 usage(pname, 1);
1169 } else if (strcmp("-x", *iter) == 0) {
1170 iter++;
1171 if (iter != end)
1172 translit = *iter;
1173 else
1174 usage(pname, 1);
1175 } else if (!strcmp("--fallback", *iter)) {
1176 fallback = TRUE;
1177 } else if (!strcmp("--no-fallback", *iter)) {
1178 fallback = FALSE;
1179 } else if (strcmp("-b", *iter) == 0 || !strcmp("--block-size", *iter)) {
1180 iter++;
1181 if (iter != end) {
1182 bufsz = atoi(*iter);
1183 if ((int) bufsz <= 0) {
1184 initMsg(pname);
1185 UnicodeString str(*iter);
1186 initMsg(pname);
1187 u_wmsg(stderr, "badBlockSize", str.getTerminatedBuffer());
1188 return 3;
1189 }
1190 } else {
1191 usage(pname, 1);
1192 }
1193 } else if (strcmp("-l", *iter) == 0 || !strcmp("--list", *iter)) {
1194 if (printTranslits) {
1195 usage(pname, 1);
1196 }
1197 printConvs = TRUE;
1198 } else if (strcmp("--default-code", *iter) == 0) {
1199 if (printTranslits) {
1200 usage(pname, 1);
1201 }
1202 printName = ucnv_getDefaultName();
1203 } else if (strcmp("--list-code", *iter) == 0) {
1204 if (printTranslits) {
1205 usage(pname, 1);
1206 }
1207
1208 iter++;
1209 if (iter != end) {
1210 UErrorCode e = U_ZERO_ERROR;
1211 printName = ucnv_getAlias(*iter, 0, &e);
1212 if (U_FAILURE(e) || !printName) {
1213 UnicodeString str(*iter);
1214 initMsg(pname);
1215 u_wmsg(stderr, "noSuchCodeset", str.getTerminatedBuffer());
1216 return 2;
1217 }
1218 } else
1219 usage(pname, 1);
1220 } else if (strcmp("--canon", *iter) == 0) {
1221 printCanon = TRUE;
1222 } else if (strcmp("-L", *iter) == 0
1223 || !strcmp("--list-transliterators", *iter)) {
1224 if (printConvs) {
1225 usage(pname, 1);
1226 }
1227 printTranslits = TRUE;
1228 } else if (strcmp("-h", *iter) == 0 || !strcmp("-?", *iter)
1229 || !strcmp("--help", *iter)) {
1230 usage(pname, 0);
1231 } else if (!strcmp("-c", *iter)) {
1232 fromucallback = UCNV_FROM_U_CALLBACK_SKIP;
1233 } else if (!strcmp("--to-callback", *iter)) {
1234 iter++;
1235 if (iter != end) {
1236 const struct callback_ent *cbe = findCallback(*iter);
1237 if (cbe) {
1238 fromucallback = cbe->fromu;
1239 fromuctxt = cbe->fromuctxt;
1240 } else {
1241 UnicodeString str(*iter);
1242 initMsg(pname);
1243 u_wmsg(stderr, "unknownCallback", str.getTerminatedBuffer());
1244 return 4;
1245 }
1246 } else {
1247 usage(pname, 1);
1248 }
1249 } else if (!strcmp("--from-callback", *iter)) {
1250 iter++;
1251 if (iter != end) {
1252 const struct callback_ent *cbe = findCallback(*iter);
1253 if (cbe) {
1254 toucallback = cbe->tou;
1255 touctxt = cbe->touctxt;
1256 } else {
1257 UnicodeString str(*iter);
1258 initMsg(pname);
1259 u_wmsg(stderr, "unknownCallback", str.getTerminatedBuffer());
1260 return 4;
1261 }
1262 } else {
1263 usage(pname, 1);
1264 }
1265 } else if (!strcmp("-i", *iter)) {
1266 toucallback = UCNV_TO_U_CALLBACK_SKIP;
1267 } else if (!strcmp("--callback", *iter)) {
1268 iter++;
1269 if (iter != end) {
1270 const struct callback_ent *cbe = findCallback(*iter);
1271 if (cbe) {
1272 fromucallback = cbe->fromu;
1273 fromuctxt = cbe->fromuctxt;
1274 toucallback = cbe->tou;
1275 touctxt = cbe->touctxt;
1276 } else {
1277 UnicodeString str(*iter);
1278 initMsg(pname);
1279 u_wmsg(stderr, "unknownCallback", str.getTerminatedBuffer());
1280 return 4;
1281 }
1282 } else {
1283 usage(pname, 1);
1284 }
1285 } else if (!strcmp("-s", *iter) || !strcmp("--silent", *iter)) {
1286 verbose = FALSE;
1287 } else if (!strcmp("-v", *iter) || !strcmp("--verbose", *iter)) {
1288 verbose = TRUE;
1289 } else if (!strcmp("-V", *iter) || !strcmp("--version", *iter)) {
1290 printf("%s v2.1 ICU " U_ICU_VERSION "\n", pname);
1291 return 0;
1292 } else if (!strcmp("-o", *iter) || !strcmp("--output", *iter)) {
1293 ++iter;
1294 if (iter != end && !outfilestr) {
1295 outfilestr = *iter;
1296 } else {
1297 usage(pname, 1);
1298 }
1299 } else if (0 == strcmp("--add-signature", *iter)) {
1300 cf.signature = 1;
1301 } else if (0 == strcmp("--remove-signature", *iter)) {
1302 cf.signature = -1;
1303 } else if (**iter == '-' && (*iter)[1]) {
1304 usage(pname, 1);
1305 } else {
1306 // move a non-option up in argv[]
1307 *remainArgvLimit++ = *iter;
1308 }
1309 }
1310
1311 if (printConvs || printName) {
1312 return printConverters(pname, printName, printCanon) ? 2 : 0;
1313 } else if (printTranslits) {
1314 return printTransliterators(printCanon) ? 3 : 0;
1315 }
1316
1317 if (!fromcpage || !uprv_strcmp(fromcpage, "-")) {
1318 fromcpage = ucnv_getDefaultName();
1319 }
1320 if (!tocpage || !uprv_strcmp(tocpage, "-")) {
1321 tocpage = ucnv_getDefaultName();
1322 }
1323
1324 // Open the correct output file or connect to stdout for reading input
1325 if (outfilestr != 0 && strcmp(outfilestr, "-")) {
1326 outfile = fopen(outfilestr, "wb");
1327 if (outfile == 0) {
1328 UnicodeString str1(outfilestr, "");
1329 UnicodeString str2(strerror(errno), "");
1330 initMsg(pname);
1331 u_wmsg(stderr, "cantCreateOutputF",
1332 str1.getBuffer(), str2.getBuffer());
1333 return 1;
1334 }
1335 } else {
1336 outfilestr = "-";
1337 outfile = stdout;
1338 #ifdef USE_FILENO_BINARY_MODE
1339 if (setmode(fileno(outfile), O_BINARY) == -1) {
1340 u_wmsg(stderr, "cantSetOutBinMode");
1341 exit(-1);
1342 }
1343 #endif
1344 }
1345
1346 /* Loop again on the arguments to find all the input files, and
1347 convert them. */
1348
1349 cf.setBufferSize(bufsz);
1350
1351 if(remainArgv < remainArgvLimit) {
1352 for (iter = remainArgv; iter != remainArgvLimit; iter++) {
1353 if (!cf.convertFile(
1354 pname, fromcpage, toucallback, touctxt, tocpage,
1355 fromucallback, fromuctxt, fallback, translit, *iter,
1356 outfile, verbose)
1357 ) {
1358 goto error_exit;
1359 }
1360 }
1361 } else {
1362 if (!cf.convertFile(
1363 pname, fromcpage, toucallback, touctxt, tocpage,
1364 fromucallback, fromuctxt, fallback, translit, 0,
1365 outfile, verbose)
1366 ) {
1367 goto error_exit;
1368 }
1369 }
1370
1371 goto normal_exit;
1372 error_exit:
1373 #if !UCONFIG_NO_LEGACY_CONVERSION
1374 ret = 1;
1375 #else
1376 fprintf(stderr, "uconv error: UCONFIG_NO_LEGACY_CONVERSION is on. See uconfig.h\n");
1377 #endif
1378 normal_exit:
1379
1380 if (outfile != stdout) {
1381 fclose(outfile);
1382 }
1383
1384 u_cleanup();
1385
1386 return ret;
1387 }
1388
1389
1390 /*
1391 * Hey, Emacs, please set the following:
1392 *
1393 * Local Variables:
1394 * indent-tabs-mode: nil
1395 * End:
1396 *
1397 */
1398