• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**************************************************************************
2 *
3 *   Copyright (C) 2000-2013, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 *
6 ***************************************************************************
7 *   file name:  convsamp.c
8 *   encoding:   ASCII (7-bit)
9 *
10 *   created on: 2000may30
11 *   created by: Steven R. Loomis
12 *
13 *   Sample code for the ICU conversion routines.
14 *
15 * Note: Nothing special is needed to build this sample. Link with
16 *       the icu UC and icu I18N libraries.
17 *
18 *       I use 'assert' for error checking, you probably will want
19 *       something more flexible.  '***BEGIN SAMPLE***' and
20 *       '***END SAMPLE***' mark pieces suitable for stand alone
21 *       code snippets.
22 *
23 *
24 *  Each test can define its own BUFFERSIZE
25 *
26 */
27 
28 #define DEBUG_TMI 0  /* define to 1 to enable Too Much Information */
29 
30 #include <stdio.h>
31 #include <ctype.h>            /* for isspace, etc.    */
32 #include <assert.h>
33 #include <string.h>
34 #include <stdlib.h>  /* malloc */
35 
36 #include "unicode/utypes.h"   /* Basic ICU data types */
37 #include "unicode/ucnv.h"     /* C   Converter API    */
38 #include "unicode/ustring.h"  /* some more string fcns*/
39 #include "unicode/uchar.h"    /* char names           */
40 #include "unicode/uloc.h"
41 #include "unicode/unistr.h"
42 
43 #include "flagcb.h"
44 
45 /* Some utility functions */
46 
47 static const UChar kNone[] = { 0x0000 };
48 
49 #define U_ASSERT(x)  { if(U_FAILURE(x)) {fflush(stdout);fflush(stderr); fprintf(stderr, #x " == %s\n", u_errorName(x)); assert(U_SUCCESS(x)); }}
50 
51 /* Print a UChar if possible, in seven characters. */
prettyPrintUChar(UChar c)52 void prettyPrintUChar(UChar c)
53 {
54   if(  (c <= 0x007F) &&
55        (isgraph(c))  ) {
56     printf(" '%c'   ", (char)(0x00FF&c));
57   } else if ( c > 0x007F ) {
58     char buf[1000];
59     UErrorCode status = U_ZERO_ERROR;
60     int32_t o;
61 
62     o = u_charName(c, U_EXTENDED_CHAR_NAME, buf, 1000, &status);
63     if(U_SUCCESS(status) && (o>0) ) {
64       buf[6] = 0;
65       printf("%7s", buf);
66     } else {
67       printf(" ??????");
68     }
69   } else {
70     switch((char)(c & 0x007F)) {
71     case ' ':
72       printf(" ' '   ");
73       break;
74     case '\t':
75       printf(" \\t    ");
76       break;
77     case '\n':
78       printf(" \\n    ");
79       break;
80     default:
81       printf("  _    ");
82       break;
83     }
84   }
85 }
86 
87 
printUChars(const char * name="?",const UChar * uch=kNone,int32_t len=-1)88 void printUChars(const char  *name = "?",
89                  const UChar *uch  = kNone,
90                  int32_t     len   = -1 )
91 {
92   int32_t i;
93 
94   if( (len == -1) && (uch) ) {
95     len = u_strlen(uch);
96   }
97 
98   printf("%5s: ", name);
99   for( i = 0; i <len; i++) {
100     printf("%-6d ", i);
101   }
102   printf("\n");
103 
104   printf("%5s: ", "uni");
105   for( i = 0; i <len; i++) {
106     printf("\\u%04X ", (int)uch[i]);
107   }
108   printf("\n");
109 
110   printf("%5s:", "ch");
111   for( i = 0; i <len; i++) {
112     prettyPrintUChar(uch[i]);
113   }
114   printf("\n");
115 }
116 
/*
 * Dump a byte string as three aligned rows: the index of each byte, its
 * \xXX value, and the byte itself when it is a graphic character.
 *
 *   name  label printed in front of the index row
 *   uch   the bytes to print
 *   len   number of bytes, or -1 to measure uch with strlen()
 */
void printBytes(const char  *name = "?",
                 const char *uch  = "",
                 int32_t     len   = -1 )
{
  /* -1 means "NUL-terminated"; only measure when there is a string at all. */
  if (uch && (len == -1)) {
    len = (int32_t)strlen(uch);
  }

  /* Row 1: indices */
  printf("%5s: ", name);
  for (int32_t col = 0; col < len; ++col) {
    printf("%-4d ", col);
  }
  printf("\n");

  /* Row 2: byte values */
  printf("%5s: ", "uni");
  for (int32_t col = 0; col < len; ++col) {
    printf("\\x%02X ", 0x00FF & (int)uch[col]);
  }
  printf("\n");

  /* Row 3: printable form, blank for non-graphic bytes */
  printf("%5s:", "ch");
  for (int32_t col = 0; col < len; ++col) {
    const int byteValue = 0x00FF & (int)uch[col];
    if (isgraph(byteValue)) {
      printf(" '%c' ", (char)uch[col]);
    } else {
      printf("     ");
    }
  }
  printf("\n");
}
149 
printUChar(UChar32 ch32)150 void printUChar(UChar32 ch32)
151 {
152     if(ch32 > 0xFFFF) {
153       printf("ch: U+%06X\n", ch32);
154     }
155     else {
156       UChar ch = (UChar)ch32;
157       printUChars("C", &ch, 1);
158     }
159 }
160 
161 /*******************************************************************
162   Very simple C sample to convert the word 'Moscow' in Russian in Unicode,
163   followed by an exclamation mark (!) into the KOI8-R Russian code page.
164 
165   This example first creates a UChar String out of the Unicode chars.
166 
167   targetSize must be set to the amount of space available in the target
168   buffer. After fromUChars is called,
169   len will contain the number of bytes in target[] which were
170   used in the resulting codepage.  In this case, there is a 1:1 mapping
171   between the input and output characters. The exclamation mark has the
172   same value in both KOI8-R and Unicode.
173 
174   src: 0      1      2      3      4      5      6
175   uni: \u041C \u043E \u0441 \u043A \u0432 \u0430 \u0021
176    ch: CYRILL CYRILL CYRILL CYRILL CYRILL CYRILL   '!'
177 
178  targ:  0    1    2    3    4    5    6
179   uni: \xED \xCF \xD3 \xCB \xD7 \xC1 \x21
180    ch:                                '!'
181 
182 
183 Converting FROM unicode
184   to koi8-r.
185   You must call ucnv_close to clean up the memory used by the
186   converter.
187 
188   'len' returns the number of OUTPUT bytes resulting from the
189   conversion.
190  */
191 
convsample_02()192 UErrorCode convsample_02()
193 {
194   printf("\n\n==============================================\n"
195          "Sample 02: C: simple Unicode -> koi8-r conversion\n");
196 
197 
198   // **************************** START SAMPLE *******************
199   // "cat<cat>OK"
200   UChar source[] = { 0x041C, 0x043E, 0x0441, 0x043A, 0x0432,
201                      0x0430, 0x0021, 0x0000 };
202   char target[100];
203   UErrorCode status = U_ZERO_ERROR;
204   UConverter *conv;
205   int32_t     len;
206 
207   // set up the converter
208   //! [ucnv_open]
209   conv = ucnv_open("koi8-r", &status);
210   //! [ucnv_open]
211   assert(U_SUCCESS(status));
212 
213   // convert to koi8-r
214   len = ucnv_fromUChars(conv, target, 100, source, -1, &status);
215   assert(U_SUCCESS(status));
216 
217   // close the converter
218   ucnv_close(conv);
219 
220   // ***************************** END SAMPLE ********************
221 
222   // Print it out
223   printUChars("src", source);
224   printf("\n");
225   printBytes("targ", target, len);
226 
227   return U_ZERO_ERROR;
228 }
229 
230 
convsample_03()231 UErrorCode convsample_03()
232 {
233   printf("\n\n==============================================\n"
234          "Sample 03: C: print out all converters\n");
235 
236   int32_t count;
237   int32_t i;
238 
239   // **************************** START SAMPLE *******************
240   count = ucnv_countAvailable();
241   printf("Available converters: %d\n", count);
242 
243   for(i=0;i<count;i++)
244   {
245     printf("%s ", ucnv_getAvailableName(i));
246   }
247 
248   // ***************************** END SAMPLE ********************
249 
250   printf("\n");
251 
252   return U_ZERO_ERROR;
253 }
254 
255 
256 
257 #define BUFFERSIZE 17 /* make it interesting :) */
258 
259 /*
260   Converting from a codepage to Unicode in bulk..
261   What is the best way to determine the buffer size?
262 
263      The 'buffersize' is in bytes of input.
264     For a given converter, dividing this by the minimum char size
265     gives you the maximum number of Unicode characters that could be
266     expected for a given number of input bytes.
267      see: ucnv_getMinCharSize()
268 
269      For example, a single byte codepage like 'Latin-3' has a
270     minimum char size of 1. (It takes at least 1 byte to represent
271     each Unicode char.) So the unicode buffer has the same number of
272     UChars as the input buffer has bytes.
273 
274      In a strictly double byte codepage such as cp1362 (Windows
275     Korean), the minimum char size is 2. So, only half as many Unicode
276     chars as bytes are needed.
277 
278      This work to calculate the buffer size is an optimization. Any
279     size of input and output buffer can be used, as long as the
280     program handles the following cases: If the input buffer is empty,
281     the source pointer will be equal to sourceLimit.  If the output
282     buffer has overflowed, U_BUFFER_OVERFLOW_ERROR will be returned.
283  */
284 
convsample_05()285 UErrorCode convsample_05()
286 {
287   printf("\n\n==============================================\n"
288          "Sample 05: C: count the number of letters in a UTF-8 document\n");
289 
290   FILE *f;
291   int32_t count;
292   char inBuf[BUFFERSIZE];
293   const char *source;
294   const char *sourceLimit;
295   UChar *uBuf;
296   UChar *target;
297   UChar *targetLimit;
298   UChar *p;
299   int32_t uBufSize = 0;
300   UConverter *conv;
301   UErrorCode status = U_ZERO_ERROR;
302   uint32_t letters=0, total=0;
303 
304   f = fopen("data01.txt", "r");
305   if(!f)
306   {
307     fprintf(stderr, "Couldn't open file 'data01.txt' (UTF-8 data file).\n");
308     return U_FILE_ACCESS_ERROR;
309   }
310 
311   // **************************** START SAMPLE *******************
312   conv = ucnv_open("utf-8", &status);
313   assert(U_SUCCESS(status));
314 
315   uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
316   printf("input bytes %d / min chars %d = %d UChars\n",
317          BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
318   uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
319   assert(uBuf!=NULL);
320 
321   // grab another buffer's worth
322   while((!feof(f)) &&
323         ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
324   {
325     // Convert bytes to unicode
326     source = inBuf;
327     sourceLimit = inBuf + count;
328 
329     do
330     {
331         target = uBuf;
332         targetLimit = uBuf + uBufSize;
333 
334         ucnv_toUnicode(conv, &target, targetLimit,
335                        &source, sourceLimit, NULL,
336                        feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
337                                    /* is true (when no more data will come) */
338                        &status);
339 
340         if(status == U_BUFFER_OVERFLOW_ERROR)
341         {
342           // simply ran out of space - we'll reset the target ptr the next
343           // time through the loop.
344           status = U_ZERO_ERROR;
345         }
346         else
347         {
348           //  Check other errors here.
349           assert(U_SUCCESS(status));
350           // Break out of the loop (by force)
351         }
352 
353         // Process the Unicode
354         // Todo: handle UTF-16/surrogates
355 
356         for(p = uBuf; p<target; p++)
357         {
358           if(u_isalpha(*p))
359             letters++;
360           total++;
361         }
362     } while (source < sourceLimit); // while simply out of space
363   }
364 
365   printf("%d letters out of %d total UChars.\n", letters, total);
366 
367   // ***************************** END SAMPLE ********************
368   ucnv_close(conv);
369 
370   printf("\n");
371 
372   fclose(f);
373 
374   return U_ZERO_ERROR;
375 }
376 #undef BUFFERSIZE
377 
378 #define BUFFERSIZE 1024
379 typedef struct
380 {
381   UChar32  codepoint;
382   uint32_t frequency;
383 } CharFreqInfo;
384 
convsample_06()385 UErrorCode convsample_06()
386 {
387   printf("\n\n==============================================\n"
388          "Sample 06: C: frequency distribution of letters in a UTF-8 document\n");
389 
390   FILE *f;
391   int32_t count;
392   char inBuf[BUFFERSIZE];
393   const char *source;
394   const char *sourceLimit;
395   int32_t uBufSize = 0;
396   UConverter *conv;
397   UErrorCode status = U_ZERO_ERROR;
398   uint32_t letters=0, total=0;
399 
400   CharFreqInfo   *info;
401   UChar32   charCount = 0x10000;  /* increase this if you want to handle non bmp.. todo: automatically bump it.. */
402   UChar32   p;
403 
404   uint32_t ie = 0;
405   uint32_t gh = 0;
406   UChar32 l = 0;
407 
408   f = fopen("data06.txt", "r");
409   if(!f)
410   {
411     fprintf(stderr, "Couldn't open file 'data06.txt' (UTF-8 data file).\n");
412     return U_FILE_ACCESS_ERROR;
413   }
414 
415   info = (CharFreqInfo*)malloc(sizeof(CharFreqInfo) * charCount);
416   if(!info)
417   {
418     fprintf(stderr, " Couldn't allocate %d bytes for freq counter\n", sizeof(CharFreqInfo)*charCount);
419   }
420 
421   /* reset frequencies */
422   for(p=0;p<charCount;p++)
423   {
424     info[p].codepoint = p;
425     info[p].frequency = 0;
426   }
427 
428   // **************************** START SAMPLE *******************
429   conv = ucnv_open("utf-8", &status);
430   assert(U_SUCCESS(status));
431 
432   uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
433   printf("input bytes %d / min chars %d = %d UChars\n",
434          BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
435 
436   // grab another buffer's worth
437   while((!feof(f)) &&
438         ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
439   {
440     // Convert bytes to unicode
441     source = inBuf;
442     sourceLimit = inBuf + count;
443 
444     while(source < sourceLimit)
445     {
446       p = ucnv_getNextUChar(conv, &source, sourceLimit, &status);
447       if(U_FAILURE(status))
448       {
449         fprintf(stderr, "%s @ %d\n", u_errorName(status), total);
450         status = U_ZERO_ERROR;
451         continue;
452       }
453       U_ASSERT(status);
454       total++;
455 
456       if(u_isalpha(p))
457         letters++;
458 
459       if((u_tolower(l) == 'i') && (u_tolower(p) == 'e'))
460         ie++;
461 
462       if((u_tolower(l) == 'g') && (u_tolower(p) == 0x0127))
463         gh++;
464 
465       if(p>charCount)
466       {
467         fprintf(stderr, "U+%06X: oh.., we only handle BMP characters so far.. redesign!\n", p);
468         free(info);
469         fclose(f);
470         ucnv_close(conv);
471         return U_UNSUPPORTED_ERROR;
472       }
473       info[p].frequency++;
474       l = p;
475     }
476   }
477 
478   fclose(f);
479   ucnv_close(conv);
480 
481   printf("%d letters out of %d total UChars.\n", letters, total);
482   printf("%d ie digraphs, %d gh digraphs.\n", ie, gh);
483 
484   // now, we could sort it..
485 
486   //  qsort(info, charCount, sizeof(info[0]), charfreq_compare);
487 
488   for(p=0;p<charCount;p++)
489   {
490     if(info[p].frequency)
491     {
492       printf("% 5d U+%06X ", info[p].frequency, p);
493       if(p <= 0xFFFF)
494       {
495         prettyPrintUChar((UChar)p);
496       }
497       printf("\n");
498     }
499   }
500   free(info);
501   // ***************************** END SAMPLE ********************
502 
503   printf("\n");
504 
505   return U_ZERO_ERROR;
506 }
507 #undef BUFFERSIZE
508 
509 
510 /******************************************************
511   You must call ucnv_close to clean up the memory used by the
512   converter.
513 
514   'len' returns the number of OUTPUT UChars resulting from the
515   conversion.
516  */
517 
convsample_12()518 UErrorCode convsample_12()
519 {
520   printf("\n\n==============================================\n"
521          "Sample 12: C: simple sjis -> unicode conversion\n");
522 
523 
524   // **************************** START SAMPLE *******************
525 
526   char source[] = { 0x63, 0x61, 0x74, (char)0x94, 0x4C, (char)0x82, 0x6E, (char)0x82, 0x6A, 0x00 };
527   UChar target[100];
528   UErrorCode status = U_ZERO_ERROR;
529   UConverter *conv;
530   int32_t     len;
531 
532   // set up the converter
533   conv = ucnv_open("shift_jis", &status);
534   assert(U_SUCCESS(status));
535 
536   // convert to Unicode
537   // Note: we can use strlen, we know it's an 8 bit null terminated codepage
538   target[6] = 0xFDCA;
539   len = ucnv_toUChars(conv, target, 100, source, strlen(source), &status);
540   U_ASSERT(status);
541   // close the converter
542   ucnv_close(conv);
543 
544   // ***************************** END SAMPLE ********************
545 
546   // Print it out
547   printBytes("src", source, strlen(source) );
548   printf("\n");
549   printUChars("targ", target, len);
550 
551   return U_ZERO_ERROR;
552 }
553 
554 /******************************************************************
555    C: Convert from codepage to Unicode one at a time.
556 */
557 
convsample_13()558 UErrorCode convsample_13()
559 {
560   printf("\n\n==============================================\n"
561          "Sample 13: C: simple Big5 -> unicode conversion, char at a time\n");
562 
563 
564   const char sourceChars[] = { 0x7a, 0x68, 0x3d, (char)0xa4, (char)0xa4, (char)0xa4, (char)0xe5, (char)0x2e };
565   //  const char sourceChars[] = { 0x7a, 0x68, 0x3d, 0xe4, 0xb8, 0xad, 0xe6, 0x96, 0x87, 0x2e };
566   const char *source, *sourceLimit;
567   UChar32 target;
568   UErrorCode status = U_ZERO_ERROR;
569   UConverter *conv = NULL;
570   int32_t srcCount=0;
571   int32_t dstCount=0;
572 
573   srcCount = sizeof(sourceChars);
574 
575   conv = ucnv_open("Big5", &status);
576   U_ASSERT(status);
577 
578   source = sourceChars;
579   sourceLimit = sourceChars + sizeof(sourceChars);
580 
581   // **************************** START SAMPLE *******************
582 
583 
584   printBytes("src",source,sourceLimit-source);
585 
586   while(source < sourceLimit)
587   {
588     puts("");
589     target = ucnv_getNextUChar (conv,
590                                 &source,
591                                 sourceLimit,
592                                 &status);
593 
594     //    printBytes("src",source,sourceLimit-source);
595     U_ASSERT(status);
596     printUChar(target);
597     dstCount++;
598   }
599 
600 
601   // ************************** END SAMPLE *************************
602 
603   printf("src=%d bytes, dst=%d uchars\n", srcCount, dstCount);
604   ucnv_close(conv);
605 
606   return U_ZERO_ERROR;
607 }
608 
609 
610 
611 
convsample_20_didSubstitute(const char * source)612 UBool convsample_20_didSubstitute(const char *source)
613 {
614   UChar uchars[100];
615   char bytes[100];
616   UConverter *conv = NULL;
617   UErrorCode status = U_ZERO_ERROR;
618   uint32_t len, len2;
619   UBool  flagVal;
620 
621   FromUFLAGContext * context = NULL;
622 
623   printf("\n\n==============================================\n"
624          "Sample 20: C: Test for substitution using callbacks\n");
625 
626   /* print out the original source */
627   printBytes("src", source);
628   printf("\n");
629 
630   /* First, convert from UTF8 to unicode */
631   conv = ucnv_open("utf-8", &status);
632   U_ASSERT(status);
633 
634   len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status);
635   U_ASSERT(status);
636 
637   printUChars("uch", uchars, len);
638   printf("\n");
639 
640   /* Now, close the converter */
641   ucnv_close(conv);
642 
643   /* Now, convert to windows-1252 */
644   conv = ucnv_open("windows-1252", &status);
645   U_ASSERT(status);
646 
647   /* Converter starts out with the SUBSTITUTE callback set. */
648 
649   /* initialize our callback */
650   context = flagCB_fromU_openContext();
651 
652   /* Set our special callback */
653   ucnv_setFromUCallBack(conv,
654                         flagCB_fromU,
655                         context,
656                         &(context->subCallback),
657                         &(context->subContext),
658                         &status);
659 
660   U_ASSERT(status);
661 
662   len2 = ucnv_fromUChars(conv, bytes, 100, uchars, len, &status);
663   U_ASSERT(status);
664 
665   flagVal = context->flag;  /* it's about to go away when we close the cnv */
666 
667   ucnv_close(conv);
668 
669   /* print out the original source */
670   printBytes("bytes", bytes, len2);
671 
672   return flagVal; /* true if callback was called */
673 }
674 
convsample_20()675 UErrorCode convsample_20()
676 {
677   const char *sample1 = "abc\xdf\xbf";
678   const char *sample2 = "abc_def";
679 
680 
681   if(convsample_20_didSubstitute(sample1))
682   {
683     printf("DID substitute.\n******\n");
684   }
685   else
686   {
687     printf("Did NOT substitute.\n*****\n");
688   }
689 
690   if(convsample_20_didSubstitute(sample2))
691   {
692     printf("DID substitute.\n******\n");
693   }
694   else
695   {
696     printf("Did NOT substitute.\n*****\n");
697   }
698 
699   return U_ZERO_ERROR;
700 }
701 
702 // 21  - C, callback, with clone and debug
703 
704 
705 
convsample_21_didSubstitute(const char * source)706 UBool convsample_21_didSubstitute(const char *source)
707 {
708   UChar uchars[100];
709   char bytes[100];
710   UConverter *conv = NULL, *cloneCnv = NULL;
711   UErrorCode status = U_ZERO_ERROR;
712   uint32_t len, len2;
713   int32_t  cloneLen;
714   UBool  flagVal = FALSE;
715   UConverterFromUCallback junkCB;
716 
717   FromUFLAGContext *flagCtx = NULL,
718                    *cloneFlagCtx = NULL;
719 
720   debugCBContext   *debugCtx1 = NULL,
721                    *debugCtx2 = NULL,
722                    *cloneDebugCtx = NULL;
723 
724   printf("\n\n==============================================\n"
725          "Sample 21: C: Test for substitution w/ callbacks & clones \n");
726 
727   /* print out the original source */
728   printBytes("src", source);
729   printf("\n");
730 
731   /* First, convert from UTF8 to unicode */
732   conv = ucnv_open("utf-8", &status);
733   U_ASSERT(status);
734 
735   len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status);
736   U_ASSERT(status);
737 
738   printUChars("uch", uchars, len);
739   printf("\n");
740 
741   /* Now, close the converter */
742   ucnv_close(conv);
743 
744   /* Now, convert to windows-1252 */
745   conv = ucnv_open("windows-1252", &status);
746   U_ASSERT(status);
747 
748   /* Converter starts out with the SUBSTITUTE callback set. */
749 
750   /* initialize our callback */
751   /* from the 'bottom' innermost, out
752    *   CNV ->  debugCtx1[debug]  ->  flagCtx[flag] -> debugCtx2[debug]  */
753 
754 #if DEBUG_TMI
755   printf("flagCB_fromU = %p\n", &flagCB_fromU);
756   printf("debugCB_fromU = %p\n", &debugCB_fromU);
757 #endif
758 
759   debugCtx1 = debugCB_openContext();
760    flagCtx  = flagCB_fromU_openContext();
761   debugCtx2 = debugCB_openContext();
762 
763   debugCtx1->subCallback =  flagCB_fromU;  /* debug1 -> flag */
764   debugCtx1->subContext  =  flagCtx;
765 
766   flagCtx->subCallback   =  debugCB_fromU; /*  flag -> debug2 */
767   flagCtx->subContext    =  debugCtx2;
768 
769   debugCtx2->subCallback =  UCNV_FROM_U_CALLBACK_SUBSTITUTE;
770   debugCtx2->subContext  = NULL;
771 
772   /* Set our special callback */
773 
774   ucnv_setFromUCallBack(conv,
775                         debugCB_fromU,
776                         debugCtx1,
777                         &(debugCtx2->subCallback),
778                         &(debugCtx2->subContext),
779                         &status);
780 
781   U_ASSERT(status);
782 
783 #if DEBUG_TMI
784   printf("Callback chain now: Converter %p -> debug1:%p-> (%p:%p)==flag:%p -> debug2:%p -> cb %p\n",
785          conv, debugCtx1, debugCtx1->subCallback,
786          debugCtx1->subContext, flagCtx, debugCtx2, debugCtx2->subCallback);
787 #endif
788 
789   cloneCnv = ucnv_safeClone(conv, NULL, NULL, &status);
790 
791   U_ASSERT(status);
792 
793 #if DEBUG_TMI
794   printf("Cloned converter from %p -> %p.  Closing %p.\n", conv, cloneCnv, conv);
795 #endif
796 
797   ucnv_close(conv);
798 
799 #if DEBUG_TMI
800   printf("%p closed.\n", conv);
801 #endif
802 
803   U_ASSERT(status);
804   /* Now, we have to extract the context */
805   cloneDebugCtx = NULL;
806   cloneFlagCtx  = NULL;
807 
808   ucnv_getFromUCallBack(cloneCnv, &junkCB, (const void **)&cloneDebugCtx);
809   if(cloneDebugCtx != NULL) {
810       cloneFlagCtx = (FromUFLAGContext*) cloneDebugCtx -> subContext;
811   }
812 
813   printf("Cloned converter chain: %p -> %p[debug1] -> %p[flag] -> %p[debug2] -> substitute\n",
814          cloneCnv, cloneDebugCtx, cloneFlagCtx, cloneFlagCtx?cloneFlagCtx->subContext:NULL );
815 
816   len2 = ucnv_fromUChars(cloneCnv, bytes, 100, uchars, len, &status);
817   U_ASSERT(status);
818 
819   if(cloneFlagCtx != NULL) {
820       flagVal = cloneFlagCtx->flag;  /* it's about to go away when we close the cnv */
821   } else {
822       printf("** Warning, couldn't get the subcallback \n");
823   }
824 
825   ucnv_close(cloneCnv);
826 
827   /* print out the original source */
828   printBytes("bytes", bytes, len2);
829 
830   return flagVal; /* true if callback was called */
831 }
832 
convsample_21()833 UErrorCode convsample_21()
834 {
835   const char *sample1 = "abc\xdf\xbf";
836   const char *sample2 = "abc_def";
837 
838   if(convsample_21_didSubstitute(sample1))
839   {
840     printf("DID substitute.\n******\n");
841   }
842   else
843   {
844     printf("Did NOT substitute.\n*****\n");
845   }
846 
847   if(convsample_21_didSubstitute(sample2))
848   {
849     printf("DID substitute.\n******\n");
850   }
851   else
852   {
853     printf("Did NOT substitute.\n*****\n");
854   }
855 
856   return U_ZERO_ERROR;
857 }
858 
859 
860 //  40-  C, cp37 -> UTF16 [data02.bin -> data40.utf16]
861 
862 #define BUFFERSIZE 17 /* make it interesting :) */
863 
convsample_40()864 UErrorCode convsample_40()
865 {
866   printf("\n\n==============================================\n"
867     "Sample 40: C: convert data02.bin from cp37 to UTF16 [data40.utf16]\n");
868 
869   FILE *f;
870   FILE *out;
871   int32_t count;
872   char inBuf[BUFFERSIZE];
873   const char *source;
874   const char *sourceLimit;
875   UChar *uBuf;
876   UChar *target;
877   UChar *targetLimit;
878   int32_t uBufSize = 0;
879   UConverter *conv = NULL;
880   UErrorCode status = U_ZERO_ERROR;
881   uint32_t inbytes=0, total=0;
882 
883   f = fopen("data02.bin", "rb");
884   if(!f)
885   {
886     fprintf(stderr, "Couldn't open file 'data02.bin' (cp37 data file).\n");
887     return U_FILE_ACCESS_ERROR;
888   }
889 
890   out = fopen("data40.utf16", "wb");
891   if(!out)
892   {
893     fprintf(stderr, "Couldn't create file 'data40.utf16'.\n");
894     fclose(f);
895     return U_FILE_ACCESS_ERROR;
896   }
897 
898   // **************************** START SAMPLE *******************
899   conv = ucnv_openCCSID(37, UCNV_IBM, &status);
900   assert(U_SUCCESS(status));
901 
902   uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
903   printf("input bytes %d / min chars %d = %d UChars\n",
904          BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
905   uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
906   assert(uBuf!=NULL);
907 
908   // grab another buffer's worth
909   while((!feof(f)) &&
910         ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
911   {
912     inbytes += count;
913 
914     // Convert bytes to unicode
915     source = inBuf;
916     sourceLimit = inBuf + count;
917 
918     do
919     {
920         target = uBuf;
921         targetLimit = uBuf + uBufSize;
922 
923         ucnv_toUnicode( conv, &target, targetLimit,
924                        &source, sourceLimit, NULL,
925                        feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
926                                    /* is true (when no more data will come) */
927                          &status);
928 
929         if(status == U_BUFFER_OVERFLOW_ERROR)
930         {
931           // simply ran out of space - we'll reset the target ptr the next
932           // time through the loop.
933           status = U_ZERO_ERROR;
934         }
935         else
936         {
937           //  Check other errors here.
938           assert(U_SUCCESS(status));
939           // Break out of the loop (by force)
940         }
941 
942         // Process the Unicode
943         // Todo: handle UTF-16/surrogates
944         assert(fwrite(uBuf, sizeof(uBuf[0]), (target-uBuf), out) ==
945                (size_t)(target-uBuf));
946         total += (target-uBuf);
947     } while (source < sourceLimit); // while simply out of space
948   }
949 
950   printf("%d bytes in,  %d UChars out.\n", inbytes, total);
951 
952   // ***************************** END SAMPLE ********************
953   ucnv_close(conv);
954 
955   fclose(f);
956   fclose(out);
957   printf("\n");
958 
959   return U_ZERO_ERROR;
960 }
961 #undef BUFFERSIZE
962 
963 
964 
965 //  46-  C, UTF16 -> latin2 [data40.utf16 -> data46.out]
966 
967 #define BUFFERSIZE 24 /* make it interesting :) */
968 
convsample_46()969 UErrorCode convsample_46()
970 {
971   printf("\n\n==============================================\n"
972     "Sample 46: C: convert data40.utf16 from UTF16 to latin2 [data46.out]\n");
973 
974   FILE *f;
975   FILE *out;
976   int32_t count;
977   UChar inBuf[BUFFERSIZE];
978   const UChar *source;
979   const UChar *sourceLimit;
980   char *buf;
981   char *target;
982   char *targetLimit;
983 
984   int32_t bufSize = 0;
985   UConverter *conv = NULL;
986   UErrorCode status = U_ZERO_ERROR;
987   uint32_t inchars=0, total=0;
988 
989   f = fopen("data40.utf16", "rb");
990   if(!f)
991   {
992     fprintf(stderr, "Couldn't open file 'data40.utf16' (did you run convsample_40() ?)\n");
993     return U_FILE_ACCESS_ERROR;
994   }
995 
996   out = fopen("data46.out", "wb");
997   if(!out)
998   {
999     fprintf(stderr, "Couldn't create file 'data46.out'.\n");
1000     fclose(f);
1001     return U_FILE_ACCESS_ERROR;
1002   }
1003 
1004   // **************************** START SAMPLE *******************
1005   conv = ucnv_open( "iso-8859-2", &status);
1006   assert(U_SUCCESS(status));
1007 
1008   bufSize = (BUFFERSIZE*ucnv_getMaxCharSize(conv));
1009   printf("input UChars[16] %d * max charsize %d = %d bytes output buffer\n",
1010          BUFFERSIZE, ucnv_getMaxCharSize(conv), bufSize);
1011   buf = (char*)malloc(bufSize * sizeof(char));
1012   assert(buf!=NULL);
1013 
1014   // grab another buffer's worth
1015   while((!feof(f)) &&
1016         ((count=fread(inBuf, sizeof(UChar), BUFFERSIZE , f)) > 0) )
1017   {
1018     inchars += count;
1019 
1020     // Convert bytes to unicode
1021     source = inBuf;
1022     sourceLimit = inBuf + count;
1023 
1024     do
1025     {
1026         target = buf;
1027         targetLimit = buf + bufSize;
1028 
1029         ucnv_fromUnicode( conv, &target, targetLimit,
1030                        &source, sourceLimit, NULL,
1031                        feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
1032                                    /* is true (when no more data will come) */
1033                          &status);
1034 
1035         if(status == U_BUFFER_OVERFLOW_ERROR)
1036         {
1037           // simply ran out of space - we'll reset the target ptr the next
1038           // time through the loop.
1039           status = U_ZERO_ERROR;
1040         }
1041         else
1042         {
1043           //  Check other errors here.
1044           assert(U_SUCCESS(status));
1045           // Break out of the loop (by force)
1046         }
1047 
1048         // Process the Unicode
1049         assert(fwrite(buf, sizeof(buf[0]), (target-buf), out) ==
1050                (size_t)(target-buf));
1051         total += (target-buf);
1052     } while (source < sourceLimit); // while simply out of space
1053   }
1054 
1055   printf("%d Uchars (%d bytes) in, %d chars out.\n", inchars, inchars * sizeof(UChar), total);
1056 
1057   // ***************************** END SAMPLE ********************
1058   ucnv_close(conv);
1059 
1060   fclose(f);
1061   fclose(out);
1062   printf("\n");
1063 
1064   return U_ZERO_ERROR;
1065 }
1066 #undef BUFFERSIZE
1067 
1068 #define BUFFERSIZE 219
1069 
convsample_50()1070 void convsample_50() {
1071   printf("\n\n==============================================\n"
1072          "Sample 50: C: ucnv_detectUnicodeSignature\n");
1073 
1074   //! [ucnv_detectUnicodeSignature]
1075   UErrorCode err = U_ZERO_ERROR;
1076   UBool discardSignature = TRUE; /* set to TRUE to throw away the initial U+FEFF */
1077   char input[] = { '\xEF','\xBB', '\xBF','\x41','\x42','\x43' };
1078   int32_t signatureLength = 0;
1079   const char *encoding = ucnv_detectUnicodeSignature(input,sizeof(input),&signatureLength,&err);
1080   UConverter *conv = NULL;
1081   UChar output[100];
1082   UChar *target = output, *out;
1083   const char *source = input;
1084   if(encoding!=NULL && U_SUCCESS(err)){
1085     // should signature be discarded ?
1086     conv = ucnv_open(encoding, &err);
1087     // do the conversion
1088     ucnv_toUnicode(conv,
1089                    &target, output + sizeof(output)/U_SIZEOF_UCHAR,
1090                    &source, input + sizeof(input),
1091                    NULL, TRUE, &err);
1092     out = output;
1093     if (discardSignature){
1094       ++out; // ignore initial U+FEFF
1095     }
1096     while(out != target) {
1097       printf("%04x ", *out++);
1098     }
1099     puts("");
1100   }
1101   //! [ucnv_detectUnicodeSignature]
1102   puts("");
1103 }
1104 
1105 
1106 
1107 /* main */
1108 
main()1109 int main()
1110 {
1111 
1112   printf("Default Converter=%s\n", ucnv_getDefaultName() );
1113 
1114   convsample_02();  // C  , u->koi8r, conv
1115   convsample_03();  // C,   iterate
1116 
1117   convsample_05();  // C,  utf8->u, getNextUChar
1118   convsample_06(); // C freq counter thingy
1119 
1120   convsample_12();  // C,  sjis->u, conv
1121   convsample_13();  // C,  big5->u, getNextU
1122 
1123   convsample_20();  // C, callback
1124   convsample_21();  // C, callback debug
1125 
1126   convsample_40();  // C,   cp37 -> UTF16 [data02.bin -> data40.utf16]
1127 
1128   convsample_46();  // C,  UTF16 -> latin3 [data41.utf16 -> data46.out]
1129 
1130   convsample_50();  // C, detect unicode signature
1131 
1132   printf("End of converter samples.\n");
1133 
1134   fflush(stdout);
1135   fflush(stderr);
1136 
1137   return 0;
1138 }
1139