1 /**************************************************************************
2 *
3 * Copyright (C) 2000-2010, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *
6 ***************************************************************************
7 * file name: convsamp.c
8 * encoding: ASCII (7-bit)
9 *
10 * created on: 2000may30
11 * created by: Steven R. Loomis
12 *
13 * Sample code for the ICU conversion routines.
14 *
15 * Note: Nothing special is needed to build this sample. Link with
16 * the icu UC and icu I18N libraries.
17 *
18 * I use 'assert' for error checking, you probably will want
19 * something more flexible. '***BEGIN SAMPLE***' and
20 * '***END SAMPLE***' mark pieces suitable for stand alone
21 * code snippets.
22 *
23 *
*   Each test can define its own BUFFERSIZE
25 *
26 */
27
28 #define DEBUG_TMI 0 /* define to 1 to enable Too Much Information */
29
30 #include <stdio.h>
31 #include <ctype.h> /* for isspace, etc. */
32 #include <assert.h>
33 #include <string.h>
34 #include <stdlib.h> /* malloc */
35
36 #include "unicode/utypes.h" /* Basic ICU data types */
37 #include "unicode/ucnv.h" /* C Converter API */
38 #include "unicode/ustring.h" /* some more string fcns*/
39 #include "unicode/uchar.h" /* char names */
40 #include "unicode/uloc.h"
41 #include "unicode/unistr.h"
42
43 #include "flagcb.h"
44
45 /* Some utility functions */
46
47 static const UChar kNone[] = { 0x0000 };
48
49 #define U_ASSERT(x) { if(U_FAILURE(x)) {fflush(stdout);fflush(stderr); fprintf(stderr, #x " == %s\n", u_errorName(x)); assert(U_SUCCESS(x)); }}
50
51 /* Print a UChar if possible, in seven characters. */
prettyPrintUChar(UChar c)52 void prettyPrintUChar(UChar c)
53 {
54 if( (c <= 0x007F) &&
55 (isgraph(c)) ) {
56 printf(" '%c' ", (char)(0x00FF&c));
57 } else if ( c > 0x007F ) {
58 char buf[1000];
59 UErrorCode status = U_ZERO_ERROR;
60 int32_t o;
61
62 o = u_charName(c, U_UNICODE_CHAR_NAME, buf, 1000, &status);
63 if(U_SUCCESS(status) && (o>0) ) {
64 buf[6] = 0;
65 printf("%7s", buf);
66 } else {
67 o = u_charName(c, U_UNICODE_10_CHAR_NAME, buf, 1000, &status);
68 if(U_SUCCESS(status) && (o>0)) {
69 buf[5] = 0;
70 printf("~%6s", buf);
71 }
72 else {
73 printf(" ??????");
74 }
75 }
76 } else {
77 switch((char)(c & 0x007F)) {
78 case ' ':
79 printf(" ' ' ");
80 break;
81 case '\t':
82 printf(" \\t ");
83 break;
84 case '\n':
85 printf(" \\n ");
86 break;
87 default:
88 printf(" _ ");
89 break;
90 }
91 }
92 }
93
94
printUChars(const char * name="?",const UChar * uch=kNone,int32_t len=-1)95 void printUChars(const char *name = "?",
96 const UChar *uch = kNone,
97 int32_t len = -1 )
98 {
99 int32_t i;
100
101 if( (len == -1) && (uch) ) {
102 len = u_strlen(uch);
103 }
104
105 printf("%5s: ", name);
106 for( i = 0; i <len; i++) {
107 printf("%-6d ", i);
108 }
109 printf("\n");
110
111 printf("%5s: ", "uni");
112 for( i = 0; i <len; i++) {
113 printf("\\u%04X ", (int)uch[i]);
114 }
115 printf("\n");
116
117 printf("%5s:", "ch");
118 for( i = 0; i <len; i++) {
119 prettyPrintUChar(uch[i]);
120 }
121 printf("\n");
122 }
123
/*
 * Dump a byte string to stdout as three rows: byte index, \xXX value, and
 * the quoted character for bytes that isgraph() accepts.
 *
 * name  label printed in front of the index row
 * uch   bytes to dump
 * len   number of bytes; -1 means "measure with strlen()"
 *       (only done when uch is non-NULL)
 */
void printBytes(const char *name = "?",
                const char *uch = "",
                int32_t len = -1 )
{
    if (uch && len == -1) {
        len = strlen(uch);
    }

    /* Row 1: indices. */
    printf("%5s: ", name);
    for (int32_t col = 0; col < len; ++col) {
        printf("%-4d ", col);
    }
    printf("\n");

    /* Row 2: byte values, masked to 8 bits so the value is never negative. */
    printf("%5s: ", "uni");
    for (int32_t col = 0; col < len; ++col) {
        printf("\\x%02X ", 0x00FF & (int)uch[col]);
    }
    printf("\n");

    /* Row 3: the character itself when it is graphic, otherwise a blank. */
    printf("%5s:", "ch");
    for (int32_t col = 0; col < len; ++col) {
        const int byteValue = 0x00FF & (int)uch[col];

        if (!isgraph(byteValue)) {
            printf(" ");
            continue;
        }
        printf(" '%c' ", (char)uch[col]);
    }
    printf("\n");
}
156
printUChar(UChar32 ch32)157 void printUChar(UChar32 ch32)
158 {
159 if(ch32 > 0xFFFF) {
160 printf("ch: U+%06X\n", ch32);
161 }
162 else {
163 UChar ch = (UChar)ch32;
164 printUChars("C", &ch, 1);
165 }
166 }
167
168 /*******************************************************************
169 Very simple C sample to convert the word 'Moscow' in Russian in Unicode,
170 followed by an exclamation mark (!) into the KOI8-R Russian code page.
171
172 This example first creates a UChar String out of the Unicode chars.
173
174 targetSize must be set to the amount of space available in the target
175 buffer. After fromUChars is called,
176 len will contain the number of bytes in target[] which were
177 used in the resulting codepage. In this case, there is a 1:1 mapping
178 between the input and output characters. The exclamation mark has the
179 same value in both KOI8-R and Unicode.
180
181 src: 0 1 2 3 4 5 6
182 uni: \u041C \u043E \u0441 \u043A \u0432 \u0430 \u0021
183 ch: CYRILL CYRILL CYRILL CYRILL CYRILL CYRILL '!'
184
185 targ: 0 1 2 3 4 5 6
186 uni: \xED \xCF \xD3 \xCB \xD7 \xC1 \x21
187 ch: '!'
188
189
190 Converting FROM unicode
191 to koi8-r.
192 You must call ucnv_close to clean up the memory used by the
193 converter.
194
195 'len' returns the number of OUTPUT bytes resulting from the
196 conversion.
197 */
198
convsample_02()199 UErrorCode convsample_02()
200 {
201 printf("\n\n==============================================\n"
202 "Sample 02: C: simple Unicode -> koi8-r conversion\n");
203
204
205 // **************************** START SAMPLE *******************
206 // "cat<cat>OK"
207 UChar source[] = { 0x041C, 0x043E, 0x0441, 0x043A, 0x0432,
208 0x0430, 0x0021, 0x0000 };
209 char target[100];
210 UErrorCode status = U_ZERO_ERROR;
211 UConverter *conv;
212 int32_t len;
213
214 // set up the converter
215 conv = ucnv_open("koi8-r", &status);
216 assert(U_SUCCESS(status));
217
218 // convert to koi8-r
219 len = ucnv_fromUChars(conv, target, 100, source, -1, &status);
220 assert(U_SUCCESS(status));
221
222 // close the converter
223 ucnv_close(conv);
224
225 // ***************************** END SAMPLE ********************
226
227 // Print it out
228 printUChars("src", source);
229 printf("\n");
230 printBytes("targ", target, len);
231
232 return U_ZERO_ERROR;
233 }
234
235
convsample_03()236 UErrorCode convsample_03()
237 {
238 printf("\n\n==============================================\n"
239 "Sample 03: C: print out all converters\n");
240
241 int32_t count;
242 int32_t i;
243
244 // **************************** START SAMPLE *******************
245 count = ucnv_countAvailable();
246 printf("Available converters: %d\n", count);
247
248 for(i=0;i<count;i++)
249 {
250 printf("%s ", ucnv_getAvailableName(i));
251 }
252
253 // ***************************** END SAMPLE ********************
254
255 printf("\n");
256
257 return U_ZERO_ERROR;
258 }
259
260
261
262 #define BUFFERSIZE 17 /* make it interesting :) */
263
264 /*
265 Converting from a codepage to Unicode in bulk..
266 What is the best way to determine the buffer size?
267
268 The 'buffersize' is in bytes of input.
For a given converter, dividing this by the minimum char size
gives you the maximum number of Unicode characters that could be
expected for a given number of input bytes.
272 see: ucnv_getMinCharSize()
273
274 For example, a single byte codepage like 'Latin-3' has a
275 minimum char size of 1. (It takes at least 1 byte to represent
276 each Unicode char.) So the unicode buffer has the same number of
277 UChars as the input buffer has bytes.
278
279 In a strictly double byte codepage such as cp1362 (Windows
280 Korean), the minimum char size is 2. So, only half as many Unicode
281 chars as bytes are needed.
282
283 This work to calculate the buffer size is an optimization. Any
284 size of input and output buffer can be used, as long as the
285 program handles the following cases: If the input buffer is empty,
286 the source pointer will be equal to sourceLimit. If the output
287 buffer has overflowed, U_BUFFER_OVERFLOW_ERROR will be returned.
288 */
289
convsample_05()290 UErrorCode convsample_05()
291 {
292 printf("\n\n==============================================\n"
293 "Sample 05: C: count the number of letters in a UTF-8 document\n");
294
295 FILE *f;
296 int32_t count;
297 char inBuf[BUFFERSIZE];
298 const char *source;
299 const char *sourceLimit;
300 UChar *uBuf;
301 UChar *target;
302 UChar *targetLimit;
303 UChar *p;
304 int32_t uBufSize = 0;
305 UConverter *conv;
306 UErrorCode status = U_ZERO_ERROR;
307 uint32_t letters=0, total=0;
308
309 f = fopen("data01.txt", "r");
310 if(!f)
311 {
312 fprintf(stderr, "Couldn't open file 'data01.txt' (UTF-8 data file).\n");
313 return U_FILE_ACCESS_ERROR;
314 }
315
316 // **************************** START SAMPLE *******************
317 conv = ucnv_open("utf-8", &status);
318 assert(U_SUCCESS(status));
319
320 uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
321 printf("input bytes %d / min chars %d = %d UChars\n",
322 BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
323 uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
324 assert(uBuf!=NULL);
325
326 // grab another buffer's worth
327 while((!feof(f)) &&
328 ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
329 {
330 // Convert bytes to unicode
331 source = inBuf;
332 sourceLimit = inBuf + count;
333
334 do
335 {
336 target = uBuf;
337 targetLimit = uBuf + uBufSize;
338
339 ucnv_toUnicode(conv, &target, targetLimit,
340 &source, sourceLimit, NULL,
341 feof(f)?TRUE:FALSE, /* pass 'flush' when eof */
342 /* is true (when no more data will come) */
343 &status);
344
345 if(status == U_BUFFER_OVERFLOW_ERROR)
346 {
347 // simply ran out of space - we'll reset the target ptr the next
348 // time through the loop.
349 status = U_ZERO_ERROR;
350 }
351 else
352 {
353 // Check other errors here.
354 assert(U_SUCCESS(status));
355 // Break out of the loop (by force)
356 }
357
358 // Process the Unicode
359 // Todo: handle UTF-16/surrogates
360
361 for(p = uBuf; p<target; p++)
362 {
363 if(u_isalpha(*p))
364 letters++;
365 total++;
366 }
367 } while (source < sourceLimit); // while simply out of space
368 }
369
370 printf("%d letters out of %d total UChars.\n", letters, total);
371
372 // ***************************** END SAMPLE ********************
373 ucnv_close(conv);
374
375 printf("\n");
376
377 fclose(f);
378
379 return U_ZERO_ERROR;
380 }
381 #undef BUFFERSIZE
382
383 #define BUFFERSIZE 1024
384 typedef struct
385 {
386 UChar32 codepoint;
387 uint32_t frequency;
388 } CharFreqInfo;
389
convsample_06()390 UErrorCode convsample_06()
391 {
392 printf("\n\n==============================================\n"
393 "Sample 06: C: frequency distribution of letters in a UTF-8 document\n");
394
395 FILE *f;
396 int32_t count;
397 char inBuf[BUFFERSIZE];
398 const char *source;
399 const char *sourceLimit;
400 UChar *uBuf;
401 int32_t uBufSize = 0;
402 UConverter *conv;
403 UErrorCode status = U_ZERO_ERROR;
404 uint32_t letters=0, total=0;
405
406 CharFreqInfo *info;
407 UChar32 charCount = 0x10000; /* increase this if you want to handle non bmp.. todo: automatically bump it.. */
408 UChar32 p;
409
410 uint32_t ie = 0;
411 uint32_t gh = 0;
412 UChar32 l = 0;
413
414 f = fopen("data06.txt", "r");
415 if(!f)
416 {
417 fprintf(stderr, "Couldn't open file 'data06.txt' (UTF-8 data file).\n");
418 return U_FILE_ACCESS_ERROR;
419 }
420
421 info = (CharFreqInfo*)malloc(sizeof(CharFreqInfo) * charCount);
422 if(!info)
423 {
424 fprintf(stderr, " Couldn't allocate %d bytes for freq counter\n", sizeof(CharFreqInfo)*charCount);
425 }
426
427 /* reset frequencies */
428 for(p=0;p<charCount;p++)
429 {
430 info[p].codepoint = p;
431 info[p].frequency = 0;
432 }
433
434 // **************************** START SAMPLE *******************
435 conv = ucnv_open("utf-8", &status);
436 assert(U_SUCCESS(status));
437
438 uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
439 printf("input bytes %d / min chars %d = %d UChars\n",
440 BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
441 uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
442 assert(uBuf!=NULL);
443
444 // grab another buffer's worth
445 while((!feof(f)) &&
446 ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
447 {
448 // Convert bytes to unicode
449 source = inBuf;
450 sourceLimit = inBuf + count;
451
452 while(source < sourceLimit)
453 {
454 p = ucnv_getNextUChar(conv, &source, sourceLimit, &status);
455 if(U_FAILURE(status))
456 {
457 fprintf(stderr, "%s @ %d\n", u_errorName(status), total);
458 status = U_ZERO_ERROR;
459 continue;
460 }
461 U_ASSERT(status);
462 total++;
463
464 if(u_isalpha(p))
465 letters++;
466
467 if((u_tolower(l) == 'i') && (u_tolower(p) == 'e'))
468 ie++;
469
470 if((u_tolower(l) == 'g') && (u_tolower(p) == 0x0127))
471 gh++;
472
473 if(p>charCount)
474 {
475 fprintf(stderr, "U+%06X: oh.., we only handle BMP characters so far.. redesign!\n", p);
476 return U_UNSUPPORTED_ERROR;
477 }
478 info[p].frequency++;
479 l = p;
480 }
481 }
482
483 fclose(f);
484 ucnv_close(conv);
485
486 printf("%d letters out of %d total UChars.\n", letters, total);
487 printf("%d ie digraphs, %d gh digraphs.\n", ie, gh);
488
489 // now, we could sort it..
490
491 // qsort(info, charCount, sizeof(info[0]), charfreq_compare);
492
493 for(p=0;p<charCount;p++)
494 {
495 if(info[p].frequency)
496 {
497 printf("% 5d U+%06X ", info[p].frequency, p);
498 if(p <= 0xFFFF)
499 {
500 prettyPrintUChar((UChar)p);
501 }
502 printf("\n");
503 }
504 }
505 free(info);
506 // ***************************** END SAMPLE ********************
507
508 printf("\n");
509
510 return U_ZERO_ERROR;
511 }
512 #undef BUFFERSIZE
513
514
515 /******************************************************
516 You must call ucnv_close to clean up the memory used by the
517 converter.
518
519 'len' returns the number of OUTPUT bytes resulting from the
520 conversion.
521 */
522
convsample_12()523 UErrorCode convsample_12()
524 {
525 printf("\n\n==============================================\n"
526 "Sample 12: C: simple sjis -> unicode conversion\n");
527
528
529 // **************************** START SAMPLE *******************
530
531 char source[] = { 0x63, 0x61, 0x74, (char)0x94, 0x4C, (char)0x82, 0x6E, (char)0x82, 0x6A, 0x00 };
532 UChar target[100];
533 UErrorCode status = U_ZERO_ERROR;
534 UConverter *conv;
535 int32_t len;
536
537 // set up the converter
538 conv = ucnv_open("shift_jis", &status);
539 assert(U_SUCCESS(status));
540
541 // convert to Unicode
542 // Note: we can use strlen, we know it's an 8 bit null terminated codepage
543 target[6] = 0xFDCA;
544 len = ucnv_toUChars(conv, target, 100, source, strlen(source), &status);
545 U_ASSERT(status);
546 // close the converter
547 ucnv_close(conv);
548
549 // ***************************** END SAMPLE ********************
550
551 // Print it out
552 printBytes("src", source, strlen(source) );
553 printf("\n");
554 printUChars("targ", target, len);
555
556 return U_ZERO_ERROR;
557 }
558
559 /******************************************************************
560 C: Convert from codepage to Unicode one at a time.
561 */
562
convsample_13()563 UErrorCode convsample_13()
564 {
565 printf("\n\n==============================================\n"
566 "Sample 13: C: simple Big5 -> unicode conversion, char at a time\n");
567
568
569 const char sourceChars[] = { 0x7a, 0x68, 0x3d, (char)0xa4, (char)0xa4, (char)0xa4, (char)0xe5, (char)0x2e };
570 // const char sourceChars[] = { 0x7a, 0x68, 0x3d, 0xe4, 0xb8, 0xad, 0xe6, 0x96, 0x87, 0x2e };
571 const char *source, *sourceLimit;
572 UChar32 target;
573 UErrorCode status = U_ZERO_ERROR;
574 UConverter *conv = NULL;
575 int32_t srcCount=0;
576 int32_t dstCount=0;
577
578 srcCount = sizeof(sourceChars);
579
580 conv = ucnv_open("Big5", &status);
581 U_ASSERT(status);
582
583 source = sourceChars;
584 sourceLimit = sourceChars + sizeof(sourceChars);
585
586 // **************************** START SAMPLE *******************
587
588
589 printBytes("src",source,sourceLimit-source);
590
591 while(source < sourceLimit)
592 {
593 puts("");
594 target = ucnv_getNextUChar (conv,
595 &source,
596 sourceLimit,
597 &status);
598
599 // printBytes("src",source,sourceLimit-source);
600 U_ASSERT(status);
601 printUChar(target);
602 dstCount++;
603 }
604
605
606 // ************************** END SAMPLE *************************
607
608 printf("src=%d bytes, dst=%d uchars\n", srcCount, dstCount);
609 ucnv_close(conv);
610
611 return U_ZERO_ERROR;
612 }
613
614
615
616
convsample_20_didSubstitute(const char * source)617 UBool convsample_20_didSubstitute(const char *source)
618 {
619 UChar uchars[100];
620 char bytes[100];
621 UConverter *conv = NULL;
622 UErrorCode status = U_ZERO_ERROR;
623 uint32_t len, len2;
624 UBool flagVal;
625
626 FromUFLAGContext * context = NULL;
627
628 printf("\n\n==============================================\n"
629 "Sample 20: C: Test for substitution using callbacks\n");
630
631 /* print out the original source */
632 printBytes("src", source);
633 printf("\n");
634
635 /* First, convert from UTF8 to unicode */
636 conv = ucnv_open("utf-8", &status);
637 U_ASSERT(status);
638
639 len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status);
640 U_ASSERT(status);
641
642 printUChars("uch", uchars, len);
643 printf("\n");
644
645 /* Now, close the converter */
646 ucnv_close(conv);
647
648 /* Now, convert to windows-1252 */
649 conv = ucnv_open("windows-1252", &status);
650 U_ASSERT(status);
651
652 /* Converter starts out with the SUBSTITUTE callback set. */
653
654 /* initialize our callback */
655 context = flagCB_fromU_openContext();
656
657 /* Set our special callback */
658 ucnv_setFromUCallBack(conv,
659 flagCB_fromU,
660 context,
661 &(context->subCallback),
662 &(context->subContext),
663 &status);
664
665 U_ASSERT(status);
666
667 len2 = ucnv_fromUChars(conv, bytes, 100, uchars, len, &status);
668 U_ASSERT(status);
669
670 flagVal = context->flag; /* it's about to go away when we close the cnv */
671
672 ucnv_close(conv);
673
674 /* print out the original source */
675 printBytes("bytes", bytes, len2);
676
677 return flagVal; /* true if callback was called */
678 }
679
convsample_20()680 UErrorCode convsample_20()
681 {
682 const char *sample1 = "abc\xdf\xbf";
683 const char *sample2 = "abc_def";
684
685
686 if(convsample_20_didSubstitute(sample1))
687 {
688 printf("DID substitute.\n******\n");
689 }
690 else
691 {
692 printf("Did NOT substitute.\n*****\n");
693 }
694
695 if(convsample_20_didSubstitute(sample2))
696 {
697 printf("DID substitute.\n******\n");
698 }
699 else
700 {
701 printf("Did NOT substitute.\n*****\n");
702 }
703
704 return U_ZERO_ERROR;
705 }
706
707 // 21 - C, callback, with clone and debug
708
709
710
convsample_21_didSubstitute(const char * source)711 UBool convsample_21_didSubstitute(const char *source)
712 {
713 UChar uchars[100];
714 char bytes[100];
715 UConverter *conv = NULL, *cloneCnv = NULL;
716 UErrorCode status = U_ZERO_ERROR;
717 uint32_t len, len2;
718 int32_t cloneLen;
719 UBool flagVal = FALSE;
720 UConverterFromUCallback junkCB;
721
722 FromUFLAGContext *flagCtx = NULL,
723 *cloneFlagCtx = NULL;
724
725 debugCBContext *debugCtx1 = NULL,
726 *debugCtx2 = NULL,
727 *cloneDebugCtx = NULL;
728
729 printf("\n\n==============================================\n"
730 "Sample 21: C: Test for substitution w/ callbacks & clones \n");
731
732 /* print out the original source */
733 printBytes("src", source);
734 printf("\n");
735
736 /* First, convert from UTF8 to unicode */
737 conv = ucnv_open("utf-8", &status);
738 U_ASSERT(status);
739
740 len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status);
741 U_ASSERT(status);
742
743 printUChars("uch", uchars, len);
744 printf("\n");
745
746 /* Now, close the converter */
747 ucnv_close(conv);
748
749 /* Now, convert to windows-1252 */
750 conv = ucnv_open("windows-1252", &status);
751 U_ASSERT(status);
752
753 /* Converter starts out with the SUBSTITUTE callback set. */
754
755 /* initialize our callback */
756 /* from the 'bottom' innermost, out
757 * CNV -> debugCtx1[debug] -> flagCtx[flag] -> debugCtx2[debug] */
758
759 #if DEBUG_TMI
760 printf("flagCB_fromU = %p\n", &flagCB_fromU);
761 printf("debugCB_fromU = %p\n", &debugCB_fromU);
762 #endif
763
764 debugCtx1 = debugCB_openContext();
765 flagCtx = flagCB_fromU_openContext();
766 debugCtx2 = debugCB_openContext();
767
768 debugCtx1->subCallback = flagCB_fromU; /* debug1 -> flag */
769 debugCtx1->subContext = flagCtx;
770
771 flagCtx->subCallback = debugCB_fromU; /* flag -> debug2 */
772 flagCtx->subContext = debugCtx2;
773
774 debugCtx2->subCallback = UCNV_FROM_U_CALLBACK_SUBSTITUTE;
775 debugCtx2->subContext = NULL;
776
777 /* Set our special callback */
778
779 ucnv_setFromUCallBack(conv,
780 debugCB_fromU,
781 debugCtx1,
782 &(debugCtx2->subCallback),
783 &(debugCtx2->subContext),
784 &status);
785
786 U_ASSERT(status);
787
788 #if DEBUG_TMI
789 printf("Callback chain now: Converter %p -> debug1:%p-> (%p:%p)==flag:%p -> debug2:%p -> cb %p\n",
790 conv, debugCtx1, debugCtx1->subCallback,
791 debugCtx1->subContext, flagCtx, debugCtx2, debugCtx2->subCallback);
792 #endif
793
794 cloneLen = 1; /* but passing in null so it will clone */
795 cloneCnv = ucnv_safeClone(conv, NULL, &cloneLen, &status);
796
797 U_ASSERT(status);
798
799 #if DEBUG_TMI
800 printf("Cloned converter from %p -> %p. Closing %p.\n", conv, cloneCnv, conv);
801 #endif
802
803 ucnv_close(conv);
804
805 #if DEBUG_TMI
806 printf("%p closed.\n", conv);
807 #endif
808
809 U_ASSERT(status);
810 /* Now, we have to extract the context */
811 cloneDebugCtx = NULL;
812 cloneFlagCtx = NULL;
813
814 ucnv_getFromUCallBack(cloneCnv, &junkCB, (const void **)&cloneDebugCtx);
815 if(cloneDebugCtx != NULL) {
816 cloneFlagCtx = (FromUFLAGContext*) cloneDebugCtx -> subContext;
817 }
818
819 printf("Cloned converter chain: %p -> %p[debug1] -> %p[flag] -> %p[debug2] -> substitute\n",
820 cloneCnv, cloneDebugCtx, cloneFlagCtx, cloneFlagCtx?cloneFlagCtx->subContext:NULL );
821
822 len2 = ucnv_fromUChars(cloneCnv, bytes, 100, uchars, len, &status);
823 U_ASSERT(status);
824
825 if(cloneFlagCtx != NULL) {
826 flagVal = cloneFlagCtx->flag; /* it's about to go away when we close the cnv */
827 } else {
828 printf("** Warning, couldn't get the subcallback \n");
829 }
830
831 ucnv_close(cloneCnv);
832
833 /* print out the original source */
834 printBytes("bytes", bytes, len2);
835
836 return flagVal; /* true if callback was called */
837 }
838
convsample_21()839 UErrorCode convsample_21()
840 {
841 const char *sample1 = "abc\xdf\xbf";
842 const char *sample2 = "abc_def";
843
844 if(convsample_21_didSubstitute(sample1))
845 {
846 printf("DID substitute.\n******\n");
847 }
848 else
849 {
850 printf("Did NOT substitute.\n*****\n");
851 }
852
853 if(convsample_21_didSubstitute(sample2))
854 {
855 printf("DID substitute.\n******\n");
856 }
857 else
858 {
859 printf("Did NOT substitute.\n*****\n");
860 }
861
862 return U_ZERO_ERROR;
863 }
864
865
866 // 40- C, cp37 -> UTF16 [data02.bin -> data40.utf16]
867
868 #define BUFFERSIZE 17 /* make it interesting :) */
869
convsample_40()870 UErrorCode convsample_40()
871 {
872 printf("\n\n==============================================\n"
873 "Sample 40: C: convert data02.bin from cp37 to UTF16 [data40.utf16]\n");
874
875 FILE *f;
876 FILE *out;
877 int32_t count;
878 char inBuf[BUFFERSIZE];
879 const char *source;
880 const char *sourceLimit;
881 UChar *uBuf;
882 UChar *target;
883 UChar *targetLimit;
884 int32_t uBufSize = 0;
885 UConverter *conv = NULL;
886 UErrorCode status = U_ZERO_ERROR;
887 uint32_t inbytes=0, total=0;
888
889 f = fopen("data02.bin", "rb");
890 if(!f)
891 {
892 fprintf(stderr, "Couldn't open file 'data02.bin' (cp37 data file).\n");
893 return U_FILE_ACCESS_ERROR;
894 }
895
896 out = fopen("data40.utf16", "wb");
897 if(!out)
898 {
899 fprintf(stderr, "Couldn't create file 'data40.utf16'.\n");
900 fclose(f);
901 return U_FILE_ACCESS_ERROR;
902 }
903
904 // **************************** START SAMPLE *******************
905 conv = ucnv_openCCSID(37, UCNV_IBM, &status);
906 assert(U_SUCCESS(status));
907
908 uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
909 printf("input bytes %d / min chars %d = %d UChars\n",
910 BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
911 uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
912 assert(uBuf!=NULL);
913
914 // grab another buffer's worth
915 while((!feof(f)) &&
916 ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
917 {
918 inbytes += count;
919
920 // Convert bytes to unicode
921 source = inBuf;
922 sourceLimit = inBuf + count;
923
924 do
925 {
926 target = uBuf;
927 targetLimit = uBuf + uBufSize;
928
929 ucnv_toUnicode( conv, &target, targetLimit,
930 &source, sourceLimit, NULL,
931 feof(f)?TRUE:FALSE, /* pass 'flush' when eof */
932 /* is true (when no more data will come) */
933 &status);
934
935 if(status == U_BUFFER_OVERFLOW_ERROR)
936 {
937 // simply ran out of space - we'll reset the target ptr the next
938 // time through the loop.
939 status = U_ZERO_ERROR;
940 }
941 else
942 {
943 // Check other errors here.
944 assert(U_SUCCESS(status));
945 // Break out of the loop (by force)
946 }
947
948 // Process the Unicode
949 // Todo: handle UTF-16/surrogates
950 assert(fwrite(uBuf, sizeof(uBuf[0]), (target-uBuf), out) ==
951 (size_t)(target-uBuf));
952 total += (target-uBuf);
953 } while (source < sourceLimit); // while simply out of space
954 }
955
956 printf("%d bytes in, %d UChars out.\n", inbytes, total);
957
958 // ***************************** END SAMPLE ********************
959 ucnv_close(conv);
960
961 fclose(f);
962 fclose(out);
963 printf("\n");
964
965 return U_ZERO_ERROR;
966 }
967 #undef BUFFERSIZE
968
969
970
971 // 46- C, UTF16 -> latin2 [data40.utf16 -> data46.out]
972
973 #define BUFFERSIZE 24 /* make it interesting :) */
974
convsample_46()975 UErrorCode convsample_46()
976 {
977 printf("\n\n==============================================\n"
978 "Sample 46: C: convert data40.utf16 from UTF16 to latin2 [data46.out]\n");
979
980 FILE *f;
981 FILE *out;
982 int32_t count;
983 UChar inBuf[BUFFERSIZE];
984 const UChar *source;
985 const UChar *sourceLimit;
986 char *buf;
987 char *target;
988 char *targetLimit;
989
990 int32_t bufSize = 0;
991 UConverter *conv = NULL;
992 UErrorCode status = U_ZERO_ERROR;
993 uint32_t inchars=0, total=0;
994
995 f = fopen("data40.utf16", "rb");
996 if(!f)
997 {
998 fprintf(stderr, "Couldn't open file 'data40.utf16' (did you run convsample_40() ?)\n");
999 return U_FILE_ACCESS_ERROR;
1000 }
1001
1002 out = fopen("data46.out", "wb");
1003 if(!out)
1004 {
1005 fprintf(stderr, "Couldn't create file 'data46.out'.\n");
1006 fclose(f);
1007 return U_FILE_ACCESS_ERROR;
1008 }
1009
1010 // **************************** START SAMPLE *******************
1011 conv = ucnv_open( "iso-8859-2", &status);
1012 assert(U_SUCCESS(status));
1013
1014 bufSize = (BUFFERSIZE*ucnv_getMaxCharSize(conv));
1015 printf("input UChars[16] %d * max charsize %d = %d bytes output buffer\n",
1016 BUFFERSIZE, ucnv_getMaxCharSize(conv), bufSize);
1017 buf = (char*)malloc(bufSize * sizeof(char));
1018 assert(buf!=NULL);
1019
1020 // grab another buffer's worth
1021 while((!feof(f)) &&
1022 ((count=fread(inBuf, sizeof(UChar), BUFFERSIZE , f)) > 0) )
1023 {
1024 inchars += count;
1025
1026 // Convert bytes to unicode
1027 source = inBuf;
1028 sourceLimit = inBuf + count;
1029
1030 do
1031 {
1032 target = buf;
1033 targetLimit = buf + bufSize;
1034
1035 ucnv_fromUnicode( conv, &target, targetLimit,
1036 &source, sourceLimit, NULL,
1037 feof(f)?TRUE:FALSE, /* pass 'flush' when eof */
1038 /* is true (when no more data will come) */
1039 &status);
1040
1041 if(status == U_BUFFER_OVERFLOW_ERROR)
1042 {
1043 // simply ran out of space - we'll reset the target ptr the next
1044 // time through the loop.
1045 status = U_ZERO_ERROR;
1046 }
1047 else
1048 {
1049 // Check other errors here.
1050 assert(U_SUCCESS(status));
1051 // Break out of the loop (by force)
1052 }
1053
1054 // Process the Unicode
1055 assert(fwrite(buf, sizeof(buf[0]), (target-buf), out) ==
1056 (size_t)(target-buf));
1057 total += (target-buf);
1058 } while (source < sourceLimit); // while simply out of space
1059 }
1060
1061 printf("%d Uchars (%d bytes) in, %d chars out.\n", inchars, inchars * sizeof(UChar), total);
1062
1063 // ***************************** END SAMPLE ********************
1064 ucnv_close(conv);
1065
1066 fclose(f);
1067 fclose(out);
1068 printf("\n");
1069
1070 return U_ZERO_ERROR;
1071 }
1072 #undef BUFFERSIZE
1073
1074 #define BUFFERSIZE 219
1075
1076
1077 /* main */
1078
main()1079 int main()
1080 {
1081
1082 printf("Default Converter=%s\n", ucnv_getDefaultName() );
1083
1084 convsample_02(); // C , u->koi8r, conv
1085 convsample_03(); // C, iterate
1086
1087 convsample_05(); // C, utf8->u, getNextUChar
1088 convsample_06(); // C freq counter thingy
1089
1090 convsample_12(); // C, sjis->u, conv
1091 convsample_13(); // C, big5->u, getNextU
1092
1093 convsample_20(); // C, callback
1094 convsample_21(); // C, callback debug
1095
1096 convsample_40(); // C, cp37 -> UTF16 [data02.bin -> data40.utf16]
1097
1098 convsample_46(); // C, UTF16 -> latin3 [data41.utf16 -> data46.out]
1099
1100 printf("End of converter samples.\n");
1101
1102 fflush(stdout);
1103 fflush(stderr);
1104
1105 return 0;
1106 }
1107