1 /**************************************************************************
2 *
3 * Copyright (C) 2000-2013, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *
6 ***************************************************************************
7 * file name: convsamp.c
8 * encoding: ASCII (7-bit)
9 *
10 * created on: 2000may30
11 * created by: Steven R. Loomis
12 *
13 * Sample code for the ICU conversion routines.
14 *
15 * Note: Nothing special is needed to build this sample. Link with
16 * the icu UC and icu I18N libraries.
17 *
18 * I use 'assert' for error checking, you probably will want
19 * something more flexible. '***BEGIN SAMPLE***' and
20 * '***END SAMPLE***' mark pieces suitable for stand alone
21 * code snippets.
22 *
23 *
* Each test can define its own BUFFERSIZE
25 *
26 */
27
28 #define DEBUG_TMI 0 /* define to 1 to enable Too Much Information */
29
30 #include <stdio.h>
31 #include <ctype.h> /* for isspace, etc. */
32 #include <assert.h>
33 #include <string.h>
34 #include <stdlib.h> /* malloc */
35
36 #include "unicode/utypes.h" /* Basic ICU data types */
37 #include "unicode/ucnv.h" /* C Converter API */
38 #include "unicode/ustring.h" /* some more string fcns*/
39 #include "unicode/uchar.h" /* char names */
40 #include "unicode/uloc.h"
41 #include "unicode/unistr.h"
42
43 #include "flagcb.h"
44
45 /* Some utility functions */
46
47 static const UChar kNone[] = { 0x0000 };
48
49 #define U_ASSERT(x) { if(U_FAILURE(x)) {fflush(stdout);fflush(stderr); fprintf(stderr, #x " == %s\n", u_errorName(x)); assert(U_SUCCESS(x)); }}
50
51 /* Print a UChar if possible, in seven characters. */
prettyPrintUChar(UChar c)52 void prettyPrintUChar(UChar c)
53 {
54 if( (c <= 0x007F) &&
55 (isgraph(c)) ) {
56 printf(" '%c' ", (char)(0x00FF&c));
57 } else if ( c > 0x007F ) {
58 char buf[1000];
59 UErrorCode status = U_ZERO_ERROR;
60 int32_t o;
61
62 o = u_charName(c, U_EXTENDED_CHAR_NAME, buf, 1000, &status);
63 if(U_SUCCESS(status) && (o>0) ) {
64 buf[6] = 0;
65 printf("%7s", buf);
66 } else {
67 printf(" ??????");
68 }
69 } else {
70 switch((char)(c & 0x007F)) {
71 case ' ':
72 printf(" ' ' ");
73 break;
74 case '\t':
75 printf(" \\t ");
76 break;
77 case '\n':
78 printf(" \\n ");
79 break;
80 default:
81 printf(" _ ");
82 break;
83 }
84 }
85 }
86
87
printUChars(const char * name="?",const UChar * uch=kNone,int32_t len=-1)88 void printUChars(const char *name = "?",
89 const UChar *uch = kNone,
90 int32_t len = -1 )
91 {
92 int32_t i;
93
94 if( (len == -1) && (uch) ) {
95 len = u_strlen(uch);
96 }
97
98 printf("%5s: ", name);
99 for( i = 0; i <len; i++) {
100 printf("%-6d ", i);
101 }
102 printf("\n");
103
104 printf("%5s: ", "uni");
105 for( i = 0; i <len; i++) {
106 printf("\\u%04X ", (int)uch[i]);
107 }
108 printf("\n");
109
110 printf("%5s:", "ch");
111 for( i = 0; i <len; i++) {
112 prettyPrintUChar(uch[i]);
113 }
114 printf("\n");
115 }
116
/*
 * Dump a byte string to stdout as three aligned rows: the byte index, the
 * \xHH value of every byte, and the byte as a quoted character when it is
 * printable (a blank placeholder otherwise).
 *
 * name  label printed in front of the index row
 * uch   the bytes to dump
 * len   number of bytes; -1 means "measure with strlen()" (only done when
 *       uch is non-NULL)
 */
void printBytes(const char *name = "?",
                const char *uch = "",
                int32_t len = -1 )
{
    int32_t pos;

    if (uch && len == -1) {
        len = strlen(uch);
    }

    /* row 1: indices */
    printf("%5s: ", name);
    pos = 0;
    while (pos < len) {
        printf("%-4d ", pos);
        ++pos;
    }
    printf("\n");

    /* row 2: byte values; masked so that negative chars print as two hex digits */
    printf("%5s: ", "uni");
    pos = 0;
    while (pos < len) {
        printf("\\x%02X ", 0x00FF & (int)uch[pos]);
        ++pos;
    }
    printf("\n");

    /* row 3: printable characters */
    printf("%5s:", "ch");
    pos = 0;
    while (pos < len) {
        const int byteValue = 0x00FF & (int)uch[pos];
        if (!isgraph(byteValue)) {
            printf(" ");
        } else {
            printf(" '%c' ", (char)uch[pos]);
        }
        ++pos;
    }
    printf("\n");
}
149
printUChar(UChar32 ch32)150 void printUChar(UChar32 ch32)
151 {
152 if(ch32 > 0xFFFF) {
153 printf("ch: U+%06X\n", ch32);
154 }
155 else {
156 UChar ch = (UChar)ch32;
157 printUChars("C", &ch, 1);
158 }
159 }
160
161 /*******************************************************************
162 Very simple C sample to convert the word 'Moscow' in Russian in Unicode,
163 followed by an exclamation mark (!) into the KOI8-R Russian code page.
164
165 This example first creates a UChar String out of the Unicode chars.
166
167 targetSize must be set to the amount of space available in the target
168 buffer. After fromUChars is called,
169 len will contain the number of bytes in target[] which were
170 used in the resulting codepage. In this case, there is a 1:1 mapping
171 between the input and output characters. The exclamation mark has the
172 same value in both KOI8-R and Unicode.
173
174 src: 0 1 2 3 4 5 6
175 uni: \u041C \u043E \u0441 \u043A \u0432 \u0430 \u0021
176 ch: CYRILL CYRILL CYRILL CYRILL CYRILL CYRILL '!'
177
178 targ: 0 1 2 3 4 5 6
179 uni: \xED \xCF \xD3 \xCB \xD7 \xC1 \x21
180 ch: '!'
181
182
183 Converting FROM unicode
184 to koi8-r.
185 You must call ucnv_close to clean up the memory used by the
186 converter.
187
188 'len' returns the number of OUTPUT bytes resulting from the
189 conversion.
190 */
191
convsample_02()192 UErrorCode convsample_02()
193 {
194 printf("\n\n==============================================\n"
195 "Sample 02: C: simple Unicode -> koi8-r conversion\n");
196
197
198 // **************************** START SAMPLE *******************
199 // "cat<cat>OK"
200 UChar source[] = { 0x041C, 0x043E, 0x0441, 0x043A, 0x0432,
201 0x0430, 0x0021, 0x0000 };
202 char target[100];
203 UErrorCode status = U_ZERO_ERROR;
204 UConverter *conv;
205 int32_t len;
206
207 // set up the converter
208 //! [ucnv_open]
209 conv = ucnv_open("koi8-r", &status);
210 //! [ucnv_open]
211 assert(U_SUCCESS(status));
212
213 // convert to koi8-r
214 len = ucnv_fromUChars(conv, target, 100, source, -1, &status);
215 assert(U_SUCCESS(status));
216
217 // close the converter
218 ucnv_close(conv);
219
220 // ***************************** END SAMPLE ********************
221
222 // Print it out
223 printUChars("src", source);
224 printf("\n");
225 printBytes("targ", target, len);
226
227 return U_ZERO_ERROR;
228 }
229
230
convsample_03()231 UErrorCode convsample_03()
232 {
233 printf("\n\n==============================================\n"
234 "Sample 03: C: print out all converters\n");
235
236 int32_t count;
237 int32_t i;
238
239 // **************************** START SAMPLE *******************
240 count = ucnv_countAvailable();
241 printf("Available converters: %d\n", count);
242
243 for(i=0;i<count;i++)
244 {
245 printf("%s ", ucnv_getAvailableName(i));
246 }
247
248 // ***************************** END SAMPLE ********************
249
250 printf("\n");
251
252 return U_ZERO_ERROR;
253 }
254
255
256
257 #define BUFFERSIZE 17 /* make it interesting :) */
258
259 /*
260 Converting from a codepage to Unicode in bulk..
261 What is the best way to determine the buffer size?
262
263 The 'buffersize' is in bytes of input.
  For a given converter, dividing this by the minimum char size
  gives you the maximum number of Unicode characters that could be
266 expected for a given number of input bytes.
267 see: ucnv_getMinCharSize()
268
269 For example, a single byte codepage like 'Latin-3' has a
270 minimum char size of 1. (It takes at least 1 byte to represent
271 each Unicode char.) So the unicode buffer has the same number of
272 UChars as the input buffer has bytes.
273
274 In a strictly double byte codepage such as cp1362 (Windows
275 Korean), the minimum char size is 2. So, only half as many Unicode
276 chars as bytes are needed.
277
278 This work to calculate the buffer size is an optimization. Any
279 size of input and output buffer can be used, as long as the
280 program handles the following cases: If the input buffer is empty,
281 the source pointer will be equal to sourceLimit. If the output
282 buffer has overflowed, U_BUFFER_OVERFLOW_ERROR will be returned.
283 */
284
convsample_05()285 UErrorCode convsample_05()
286 {
287 printf("\n\n==============================================\n"
288 "Sample 05: C: count the number of letters in a UTF-8 document\n");
289
290 FILE *f;
291 int32_t count;
292 char inBuf[BUFFERSIZE];
293 const char *source;
294 const char *sourceLimit;
295 UChar *uBuf;
296 UChar *target;
297 UChar *targetLimit;
298 UChar *p;
299 int32_t uBufSize = 0;
300 UConverter *conv;
301 UErrorCode status = U_ZERO_ERROR;
302 uint32_t letters=0, total=0;
303
304 f = fopen("data01.txt", "r");
305 if(!f)
306 {
307 fprintf(stderr, "Couldn't open file 'data01.txt' (UTF-8 data file).\n");
308 return U_FILE_ACCESS_ERROR;
309 }
310
311 // **************************** START SAMPLE *******************
312 conv = ucnv_open("utf-8", &status);
313 assert(U_SUCCESS(status));
314
315 uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
316 printf("input bytes %d / min chars %d = %d UChars\n",
317 BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
318 uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
319 assert(uBuf!=NULL);
320
321 // grab another buffer's worth
322 while((!feof(f)) &&
323 ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
324 {
325 // Convert bytes to unicode
326 source = inBuf;
327 sourceLimit = inBuf + count;
328
329 do
330 {
331 target = uBuf;
332 targetLimit = uBuf + uBufSize;
333
334 ucnv_toUnicode(conv, &target, targetLimit,
335 &source, sourceLimit, NULL,
336 feof(f)?TRUE:FALSE, /* pass 'flush' when eof */
337 /* is true (when no more data will come) */
338 &status);
339
340 if(status == U_BUFFER_OVERFLOW_ERROR)
341 {
342 // simply ran out of space - we'll reset the target ptr the next
343 // time through the loop.
344 status = U_ZERO_ERROR;
345 }
346 else
347 {
348 // Check other errors here.
349 assert(U_SUCCESS(status));
350 // Break out of the loop (by force)
351 }
352
353 // Process the Unicode
354 // Todo: handle UTF-16/surrogates
355
356 for(p = uBuf; p<target; p++)
357 {
358 if(u_isalpha(*p))
359 letters++;
360 total++;
361 }
362 } while (source < sourceLimit); // while simply out of space
363 }
364
365 printf("%d letters out of %d total UChars.\n", letters, total);
366
367 // ***************************** END SAMPLE ********************
368 ucnv_close(conv);
369
370 printf("\n");
371
372 fclose(f);
373
374 return U_ZERO_ERROR;
375 }
376 #undef BUFFERSIZE
377
378 #define BUFFERSIZE 1024
379 typedef struct
380 {
381 UChar32 codepoint;
382 uint32_t frequency;
383 } CharFreqInfo;
384
convsample_06()385 UErrorCode convsample_06()
386 {
387 printf("\n\n==============================================\n"
388 "Sample 06: C: frequency distribution of letters in a UTF-8 document\n");
389
390 FILE *f;
391 int32_t count;
392 char inBuf[BUFFERSIZE];
393 const char *source;
394 const char *sourceLimit;
395 int32_t uBufSize = 0;
396 UConverter *conv;
397 UErrorCode status = U_ZERO_ERROR;
398 uint32_t letters=0, total=0;
399
400 CharFreqInfo *info;
401 UChar32 charCount = 0x10000; /* increase this if you want to handle non bmp.. todo: automatically bump it.. */
402 UChar32 p;
403
404 uint32_t ie = 0;
405 uint32_t gh = 0;
406 UChar32 l = 0;
407
408 f = fopen("data06.txt", "r");
409 if(!f)
410 {
411 fprintf(stderr, "Couldn't open file 'data06.txt' (UTF-8 data file).\n");
412 return U_FILE_ACCESS_ERROR;
413 }
414
415 info = (CharFreqInfo*)malloc(sizeof(CharFreqInfo) * charCount);
416 if(!info)
417 {
418 fprintf(stderr, " Couldn't allocate %d bytes for freq counter\n", sizeof(CharFreqInfo)*charCount);
419 }
420
421 /* reset frequencies */
422 for(p=0;p<charCount;p++)
423 {
424 info[p].codepoint = p;
425 info[p].frequency = 0;
426 }
427
428 // **************************** START SAMPLE *******************
429 conv = ucnv_open("utf-8", &status);
430 assert(U_SUCCESS(status));
431
432 uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
433 printf("input bytes %d / min chars %d = %d UChars\n",
434 BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
435
436 // grab another buffer's worth
437 while((!feof(f)) &&
438 ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
439 {
440 // Convert bytes to unicode
441 source = inBuf;
442 sourceLimit = inBuf + count;
443
444 while(source < sourceLimit)
445 {
446 p = ucnv_getNextUChar(conv, &source, sourceLimit, &status);
447 if(U_FAILURE(status))
448 {
449 fprintf(stderr, "%s @ %d\n", u_errorName(status), total);
450 status = U_ZERO_ERROR;
451 continue;
452 }
453 U_ASSERT(status);
454 total++;
455
456 if(u_isalpha(p))
457 letters++;
458
459 if((u_tolower(l) == 'i') && (u_tolower(p) == 'e'))
460 ie++;
461
462 if((u_tolower(l) == 'g') && (u_tolower(p) == 0x0127))
463 gh++;
464
465 if(p>charCount)
466 {
467 fprintf(stderr, "U+%06X: oh.., we only handle BMP characters so far.. redesign!\n", p);
468 free(info);
469 fclose(f);
470 ucnv_close(conv);
471 return U_UNSUPPORTED_ERROR;
472 }
473 info[p].frequency++;
474 l = p;
475 }
476 }
477
478 fclose(f);
479 ucnv_close(conv);
480
481 printf("%d letters out of %d total UChars.\n", letters, total);
482 printf("%d ie digraphs, %d gh digraphs.\n", ie, gh);
483
484 // now, we could sort it..
485
486 // qsort(info, charCount, sizeof(info[0]), charfreq_compare);
487
488 for(p=0;p<charCount;p++)
489 {
490 if(info[p].frequency)
491 {
492 printf("% 5d U+%06X ", info[p].frequency, p);
493 if(p <= 0xFFFF)
494 {
495 prettyPrintUChar((UChar)p);
496 }
497 printf("\n");
498 }
499 }
500 free(info);
501 // ***************************** END SAMPLE ********************
502
503 printf("\n");
504
505 return U_ZERO_ERROR;
506 }
507 #undef BUFFERSIZE
508
509
510 /******************************************************
511 You must call ucnv_close to clean up the memory used by the
512 converter.
513
514 'len' returns the number of OUTPUT bytes resulting from the
515 conversion.
516 */
517
convsample_12()518 UErrorCode convsample_12()
519 {
520 printf("\n\n==============================================\n"
521 "Sample 12: C: simple sjis -> unicode conversion\n");
522
523
524 // **************************** START SAMPLE *******************
525
526 char source[] = { 0x63, 0x61, 0x74, (char)0x94, 0x4C, (char)0x82, 0x6E, (char)0x82, 0x6A, 0x00 };
527 UChar target[100];
528 UErrorCode status = U_ZERO_ERROR;
529 UConverter *conv;
530 int32_t len;
531
532 // set up the converter
533 conv = ucnv_open("shift_jis", &status);
534 assert(U_SUCCESS(status));
535
536 // convert to Unicode
537 // Note: we can use strlen, we know it's an 8 bit null terminated codepage
538 target[6] = 0xFDCA;
539 len = ucnv_toUChars(conv, target, 100, source, strlen(source), &status);
540 U_ASSERT(status);
541 // close the converter
542 ucnv_close(conv);
543
544 // ***************************** END SAMPLE ********************
545
546 // Print it out
547 printBytes("src", source, strlen(source) );
548 printf("\n");
549 printUChars("targ", target, len);
550
551 return U_ZERO_ERROR;
552 }
553
554 /******************************************************************
555 C: Convert from codepage to Unicode one at a time.
556 */
557
convsample_13()558 UErrorCode convsample_13()
559 {
560 printf("\n\n==============================================\n"
561 "Sample 13: C: simple Big5 -> unicode conversion, char at a time\n");
562
563
564 const char sourceChars[] = { 0x7a, 0x68, 0x3d, (char)0xa4, (char)0xa4, (char)0xa4, (char)0xe5, (char)0x2e };
565 // const char sourceChars[] = { 0x7a, 0x68, 0x3d, 0xe4, 0xb8, 0xad, 0xe6, 0x96, 0x87, 0x2e };
566 const char *source, *sourceLimit;
567 UChar32 target;
568 UErrorCode status = U_ZERO_ERROR;
569 UConverter *conv = NULL;
570 int32_t srcCount=0;
571 int32_t dstCount=0;
572
573 srcCount = sizeof(sourceChars);
574
575 conv = ucnv_open("Big5", &status);
576 U_ASSERT(status);
577
578 source = sourceChars;
579 sourceLimit = sourceChars + sizeof(sourceChars);
580
581 // **************************** START SAMPLE *******************
582
583
584 printBytes("src",source,sourceLimit-source);
585
586 while(source < sourceLimit)
587 {
588 puts("");
589 target = ucnv_getNextUChar (conv,
590 &source,
591 sourceLimit,
592 &status);
593
594 // printBytes("src",source,sourceLimit-source);
595 U_ASSERT(status);
596 printUChar(target);
597 dstCount++;
598 }
599
600
601 // ************************** END SAMPLE *************************
602
603 printf("src=%d bytes, dst=%d uchars\n", srcCount, dstCount);
604 ucnv_close(conv);
605
606 return U_ZERO_ERROR;
607 }
608
609
610
611
convsample_20_didSubstitute(const char * source)612 UBool convsample_20_didSubstitute(const char *source)
613 {
614 UChar uchars[100];
615 char bytes[100];
616 UConverter *conv = NULL;
617 UErrorCode status = U_ZERO_ERROR;
618 uint32_t len, len2;
619 UBool flagVal;
620
621 FromUFLAGContext * context = NULL;
622
623 printf("\n\n==============================================\n"
624 "Sample 20: C: Test for substitution using callbacks\n");
625
626 /* print out the original source */
627 printBytes("src", source);
628 printf("\n");
629
630 /* First, convert from UTF8 to unicode */
631 conv = ucnv_open("utf-8", &status);
632 U_ASSERT(status);
633
634 len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status);
635 U_ASSERT(status);
636
637 printUChars("uch", uchars, len);
638 printf("\n");
639
640 /* Now, close the converter */
641 ucnv_close(conv);
642
643 /* Now, convert to windows-1252 */
644 conv = ucnv_open("windows-1252", &status);
645 U_ASSERT(status);
646
647 /* Converter starts out with the SUBSTITUTE callback set. */
648
649 /* initialize our callback */
650 context = flagCB_fromU_openContext();
651
652 /* Set our special callback */
653 ucnv_setFromUCallBack(conv,
654 flagCB_fromU,
655 context,
656 &(context->subCallback),
657 &(context->subContext),
658 &status);
659
660 U_ASSERT(status);
661
662 len2 = ucnv_fromUChars(conv, bytes, 100, uchars, len, &status);
663 U_ASSERT(status);
664
665 flagVal = context->flag; /* it's about to go away when we close the cnv */
666
667 ucnv_close(conv);
668
669 /* print out the original source */
670 printBytes("bytes", bytes, len2);
671
672 return flagVal; /* true if callback was called */
673 }
674
convsample_20()675 UErrorCode convsample_20()
676 {
677 const char *sample1 = "abc\xdf\xbf";
678 const char *sample2 = "abc_def";
679
680
681 if(convsample_20_didSubstitute(sample1))
682 {
683 printf("DID substitute.\n******\n");
684 }
685 else
686 {
687 printf("Did NOT substitute.\n*****\n");
688 }
689
690 if(convsample_20_didSubstitute(sample2))
691 {
692 printf("DID substitute.\n******\n");
693 }
694 else
695 {
696 printf("Did NOT substitute.\n*****\n");
697 }
698
699 return U_ZERO_ERROR;
700 }
701
702 // 21 - C, callback, with clone and debug
703
704
705
convsample_21_didSubstitute(const char * source)706 UBool convsample_21_didSubstitute(const char *source)
707 {
708 UChar uchars[100];
709 char bytes[100];
710 UConverter *conv = NULL, *cloneCnv = NULL;
711 UErrorCode status = U_ZERO_ERROR;
712 uint32_t len, len2;
713 int32_t cloneLen;
714 UBool flagVal = FALSE;
715 UConverterFromUCallback junkCB;
716
717 FromUFLAGContext *flagCtx = NULL,
718 *cloneFlagCtx = NULL;
719
720 debugCBContext *debugCtx1 = NULL,
721 *debugCtx2 = NULL,
722 *cloneDebugCtx = NULL;
723
724 printf("\n\n==============================================\n"
725 "Sample 21: C: Test for substitution w/ callbacks & clones \n");
726
727 /* print out the original source */
728 printBytes("src", source);
729 printf("\n");
730
731 /* First, convert from UTF8 to unicode */
732 conv = ucnv_open("utf-8", &status);
733 U_ASSERT(status);
734
735 len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status);
736 U_ASSERT(status);
737
738 printUChars("uch", uchars, len);
739 printf("\n");
740
741 /* Now, close the converter */
742 ucnv_close(conv);
743
744 /* Now, convert to windows-1252 */
745 conv = ucnv_open("windows-1252", &status);
746 U_ASSERT(status);
747
748 /* Converter starts out with the SUBSTITUTE callback set. */
749
750 /* initialize our callback */
751 /* from the 'bottom' innermost, out
752 * CNV -> debugCtx1[debug] -> flagCtx[flag] -> debugCtx2[debug] */
753
754 #if DEBUG_TMI
755 printf("flagCB_fromU = %p\n", &flagCB_fromU);
756 printf("debugCB_fromU = %p\n", &debugCB_fromU);
757 #endif
758
759 debugCtx1 = debugCB_openContext();
760 flagCtx = flagCB_fromU_openContext();
761 debugCtx2 = debugCB_openContext();
762
763 debugCtx1->subCallback = flagCB_fromU; /* debug1 -> flag */
764 debugCtx1->subContext = flagCtx;
765
766 flagCtx->subCallback = debugCB_fromU; /* flag -> debug2 */
767 flagCtx->subContext = debugCtx2;
768
769 debugCtx2->subCallback = UCNV_FROM_U_CALLBACK_SUBSTITUTE;
770 debugCtx2->subContext = NULL;
771
772 /* Set our special callback */
773
774 ucnv_setFromUCallBack(conv,
775 debugCB_fromU,
776 debugCtx1,
777 &(debugCtx2->subCallback),
778 &(debugCtx2->subContext),
779 &status);
780
781 U_ASSERT(status);
782
783 #if DEBUG_TMI
784 printf("Callback chain now: Converter %p -> debug1:%p-> (%p:%p)==flag:%p -> debug2:%p -> cb %p\n",
785 conv, debugCtx1, debugCtx1->subCallback,
786 debugCtx1->subContext, flagCtx, debugCtx2, debugCtx2->subCallback);
787 #endif
788
789 cloneCnv = ucnv_safeClone(conv, NULL, NULL, &status);
790
791 U_ASSERT(status);
792
793 #if DEBUG_TMI
794 printf("Cloned converter from %p -> %p. Closing %p.\n", conv, cloneCnv, conv);
795 #endif
796
797 ucnv_close(conv);
798
799 #if DEBUG_TMI
800 printf("%p closed.\n", conv);
801 #endif
802
803 U_ASSERT(status);
804 /* Now, we have to extract the context */
805 cloneDebugCtx = NULL;
806 cloneFlagCtx = NULL;
807
808 ucnv_getFromUCallBack(cloneCnv, &junkCB, (const void **)&cloneDebugCtx);
809 if(cloneDebugCtx != NULL) {
810 cloneFlagCtx = (FromUFLAGContext*) cloneDebugCtx -> subContext;
811 }
812
813 printf("Cloned converter chain: %p -> %p[debug1] -> %p[flag] -> %p[debug2] -> substitute\n",
814 cloneCnv, cloneDebugCtx, cloneFlagCtx, cloneFlagCtx?cloneFlagCtx->subContext:NULL );
815
816 len2 = ucnv_fromUChars(cloneCnv, bytes, 100, uchars, len, &status);
817 U_ASSERT(status);
818
819 if(cloneFlagCtx != NULL) {
820 flagVal = cloneFlagCtx->flag; /* it's about to go away when we close the cnv */
821 } else {
822 printf("** Warning, couldn't get the subcallback \n");
823 }
824
825 ucnv_close(cloneCnv);
826
827 /* print out the original source */
828 printBytes("bytes", bytes, len2);
829
830 return flagVal; /* true if callback was called */
831 }
832
convsample_21()833 UErrorCode convsample_21()
834 {
835 const char *sample1 = "abc\xdf\xbf";
836 const char *sample2 = "abc_def";
837
838 if(convsample_21_didSubstitute(sample1))
839 {
840 printf("DID substitute.\n******\n");
841 }
842 else
843 {
844 printf("Did NOT substitute.\n*****\n");
845 }
846
847 if(convsample_21_didSubstitute(sample2))
848 {
849 printf("DID substitute.\n******\n");
850 }
851 else
852 {
853 printf("Did NOT substitute.\n*****\n");
854 }
855
856 return U_ZERO_ERROR;
857 }
858
859
860 // 40- C, cp37 -> UTF16 [data02.bin -> data40.utf16]
861
862 #define BUFFERSIZE 17 /* make it interesting :) */
863
convsample_40()864 UErrorCode convsample_40()
865 {
866 printf("\n\n==============================================\n"
867 "Sample 40: C: convert data02.bin from cp37 to UTF16 [data40.utf16]\n");
868
869 FILE *f;
870 FILE *out;
871 int32_t count;
872 char inBuf[BUFFERSIZE];
873 const char *source;
874 const char *sourceLimit;
875 UChar *uBuf;
876 UChar *target;
877 UChar *targetLimit;
878 int32_t uBufSize = 0;
879 UConverter *conv = NULL;
880 UErrorCode status = U_ZERO_ERROR;
881 uint32_t inbytes=0, total=0;
882
883 f = fopen("data02.bin", "rb");
884 if(!f)
885 {
886 fprintf(stderr, "Couldn't open file 'data02.bin' (cp37 data file).\n");
887 return U_FILE_ACCESS_ERROR;
888 }
889
890 out = fopen("data40.utf16", "wb");
891 if(!out)
892 {
893 fprintf(stderr, "Couldn't create file 'data40.utf16'.\n");
894 fclose(f);
895 return U_FILE_ACCESS_ERROR;
896 }
897
898 // **************************** START SAMPLE *******************
899 conv = ucnv_openCCSID(37, UCNV_IBM, &status);
900 assert(U_SUCCESS(status));
901
902 uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
903 printf("input bytes %d / min chars %d = %d UChars\n",
904 BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
905 uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
906 assert(uBuf!=NULL);
907
908 // grab another buffer's worth
909 while((!feof(f)) &&
910 ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
911 {
912 inbytes += count;
913
914 // Convert bytes to unicode
915 source = inBuf;
916 sourceLimit = inBuf + count;
917
918 do
919 {
920 target = uBuf;
921 targetLimit = uBuf + uBufSize;
922
923 ucnv_toUnicode( conv, &target, targetLimit,
924 &source, sourceLimit, NULL,
925 feof(f)?TRUE:FALSE, /* pass 'flush' when eof */
926 /* is true (when no more data will come) */
927 &status);
928
929 if(status == U_BUFFER_OVERFLOW_ERROR)
930 {
931 // simply ran out of space - we'll reset the target ptr the next
932 // time through the loop.
933 status = U_ZERO_ERROR;
934 }
935 else
936 {
937 // Check other errors here.
938 assert(U_SUCCESS(status));
939 // Break out of the loop (by force)
940 }
941
942 // Process the Unicode
943 // Todo: handle UTF-16/surrogates
944 assert(fwrite(uBuf, sizeof(uBuf[0]), (target-uBuf), out) ==
945 (size_t)(target-uBuf));
946 total += (target-uBuf);
947 } while (source < sourceLimit); // while simply out of space
948 }
949
950 printf("%d bytes in, %d UChars out.\n", inbytes, total);
951
952 // ***************************** END SAMPLE ********************
953 ucnv_close(conv);
954
955 fclose(f);
956 fclose(out);
957 printf("\n");
958
959 return U_ZERO_ERROR;
960 }
961 #undef BUFFERSIZE
962
963
964
965 // 46- C, UTF16 -> latin2 [data40.utf16 -> data46.out]
966
967 #define BUFFERSIZE 24 /* make it interesting :) */
968
convsample_46()969 UErrorCode convsample_46()
970 {
971 printf("\n\n==============================================\n"
972 "Sample 46: C: convert data40.utf16 from UTF16 to latin2 [data46.out]\n");
973
974 FILE *f;
975 FILE *out;
976 int32_t count;
977 UChar inBuf[BUFFERSIZE];
978 const UChar *source;
979 const UChar *sourceLimit;
980 char *buf;
981 char *target;
982 char *targetLimit;
983
984 int32_t bufSize = 0;
985 UConverter *conv = NULL;
986 UErrorCode status = U_ZERO_ERROR;
987 uint32_t inchars=0, total=0;
988
989 f = fopen("data40.utf16", "rb");
990 if(!f)
991 {
992 fprintf(stderr, "Couldn't open file 'data40.utf16' (did you run convsample_40() ?)\n");
993 return U_FILE_ACCESS_ERROR;
994 }
995
996 out = fopen("data46.out", "wb");
997 if(!out)
998 {
999 fprintf(stderr, "Couldn't create file 'data46.out'.\n");
1000 fclose(f);
1001 return U_FILE_ACCESS_ERROR;
1002 }
1003
1004 // **************************** START SAMPLE *******************
1005 conv = ucnv_open( "iso-8859-2", &status);
1006 assert(U_SUCCESS(status));
1007
1008 bufSize = (BUFFERSIZE*ucnv_getMaxCharSize(conv));
1009 printf("input UChars[16] %d * max charsize %d = %d bytes output buffer\n",
1010 BUFFERSIZE, ucnv_getMaxCharSize(conv), bufSize);
1011 buf = (char*)malloc(bufSize * sizeof(char));
1012 assert(buf!=NULL);
1013
1014 // grab another buffer's worth
1015 while((!feof(f)) &&
1016 ((count=fread(inBuf, sizeof(UChar), BUFFERSIZE , f)) > 0) )
1017 {
1018 inchars += count;
1019
1020 // Convert bytes to unicode
1021 source = inBuf;
1022 sourceLimit = inBuf + count;
1023
1024 do
1025 {
1026 target = buf;
1027 targetLimit = buf + bufSize;
1028
1029 ucnv_fromUnicode( conv, &target, targetLimit,
1030 &source, sourceLimit, NULL,
1031 feof(f)?TRUE:FALSE, /* pass 'flush' when eof */
1032 /* is true (when no more data will come) */
1033 &status);
1034
1035 if(status == U_BUFFER_OVERFLOW_ERROR)
1036 {
1037 // simply ran out of space - we'll reset the target ptr the next
1038 // time through the loop.
1039 status = U_ZERO_ERROR;
1040 }
1041 else
1042 {
1043 // Check other errors here.
1044 assert(U_SUCCESS(status));
1045 // Break out of the loop (by force)
1046 }
1047
1048 // Process the Unicode
1049 assert(fwrite(buf, sizeof(buf[0]), (target-buf), out) ==
1050 (size_t)(target-buf));
1051 total += (target-buf);
1052 } while (source < sourceLimit); // while simply out of space
1053 }
1054
1055 printf("%d Uchars (%d bytes) in, %d chars out.\n", inchars, inchars * sizeof(UChar), total);
1056
1057 // ***************************** END SAMPLE ********************
1058 ucnv_close(conv);
1059
1060 fclose(f);
1061 fclose(out);
1062 printf("\n");
1063
1064 return U_ZERO_ERROR;
1065 }
1066 #undef BUFFERSIZE
1067
1068 #define BUFFERSIZE 219
1069
convsample_50()1070 void convsample_50() {
1071 printf("\n\n==============================================\n"
1072 "Sample 50: C: ucnv_detectUnicodeSignature\n");
1073
1074 //! [ucnv_detectUnicodeSignature]
1075 UErrorCode err = U_ZERO_ERROR;
1076 UBool discardSignature = TRUE; /* set to TRUE to throw away the initial U+FEFF */
1077 char input[] = { '\xEF','\xBB', '\xBF','\x41','\x42','\x43' };
1078 int32_t signatureLength = 0;
1079 const char *encoding = ucnv_detectUnicodeSignature(input,sizeof(input),&signatureLength,&err);
1080 UConverter *conv = NULL;
1081 UChar output[100];
1082 UChar *target = output, *out;
1083 const char *source = input;
1084 if(encoding!=NULL && U_SUCCESS(err)){
1085 // should signature be discarded ?
1086 conv = ucnv_open(encoding, &err);
1087 // do the conversion
1088 ucnv_toUnicode(conv,
1089 &target, output + sizeof(output)/U_SIZEOF_UCHAR,
1090 &source, input + sizeof(input),
1091 NULL, TRUE, &err);
1092 out = output;
1093 if (discardSignature){
1094 ++out; // ignore initial U+FEFF
1095 }
1096 while(out != target) {
1097 printf("%04x ", *out++);
1098 }
1099 puts("");
1100 }
1101 //! [ucnv_detectUnicodeSignature]
1102 puts("");
1103 }
1104
1105
1106
1107 /* main */
1108
main()1109 int main()
1110 {
1111
1112 printf("Default Converter=%s\n", ucnv_getDefaultName() );
1113
1114 convsample_02(); // C , u->koi8r, conv
1115 convsample_03(); // C, iterate
1116
1117 convsample_05(); // C, utf8->u, getNextUChar
1118 convsample_06(); // C freq counter thingy
1119
1120 convsample_12(); // C, sjis->u, conv
1121 convsample_13(); // C, big5->u, getNextU
1122
1123 convsample_20(); // C, callback
1124 convsample_21(); // C, callback debug
1125
1126 convsample_40(); // C, cp37 -> UTF16 [data02.bin -> data40.utf16]
1127
1128 convsample_46(); // C, UTF16 -> latin3 [data41.utf16 -> data46.out]
1129
1130 convsample_50(); // C, detect unicode signature
1131
1132 printf("End of converter samples.\n");
1133
1134 fflush(stdout);
1135 fflush(stderr);
1136
1137 return 0;
1138 }
1139