1 /*************************************************************************
2 *
3 * © 2016 and later: Unicode, Inc. and others.
4 * License & terms of use: http://www.unicode.org/copyright.html#License
5 *
6 **************************************************************************
7 **************************************************************************
8 *
9 * Copyright (C) 2000-2016, International Business Machines
10 * Corporation and others. All Rights Reserved.
11 *
12 ***************************************************************************
13 * file name: convsamp.c
14 * encoding: ASCII (7-bit)
15 *
16 * created on: 2000may30
17 * created by: Steven R. Loomis
18 *
19 * Sample code for the ICU conversion routines.
20 *
21 * Note: Nothing special is needed to build this sample. Link with
22 * the icu UC and icu I18N libraries.
23 *
24 * I use 'assert' for error checking, you probably will want
25 * something more flexible. '***BEGIN SAMPLE***' and
26 * '***END SAMPLE***' mark pieces suitable for stand alone
27 * code snippets.
28 *
29 *
30 * Each test can define its own BUFFERSIZE
31 *
32 */
33
34 #define DEBUG_TMI 0 /* define to 1 to enable Too Much Information */
35
36 #include <stdio.h>
37 #include <ctype.h> /* for isspace, etc. */
38 #include <assert.h>
39 #include <string.h>
40 #include <stdlib.h> /* malloc */
41
42 #include "unicode/utypes.h" /* Basic ICU data types */
43 #include "unicode/ucnv.h" /* C Converter API */
44 #include "unicode/ustring.h" /* some more string fcns*/
45 #include "unicode/uchar.h" /* char names */
46 #include "unicode/uloc.h"
47 #include "unicode/unistr.h"
48
49 #include "flagcb.h"
50
51 /* Some utility functions */
52 #ifndef UPRV_LENGTHOF
53 #define UPRV_LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
54 #endif
55
56 static const UChar kNone[] = { 0x0000 };
57
58 #define U_ASSERT(x) { if(U_FAILURE(x)) {fflush(stdout);fflush(stderr); fprintf(stderr, #x " == %s\n", u_errorName(x)); assert(U_SUCCESS(x)); }}
59
60 /* Print a UChar if possible, in seven characters. */
prettyPrintUChar(UChar c)61 void prettyPrintUChar(UChar c)
62 {
63 if( (c <= 0x007F) &&
64 (isgraph(c)) ) {
65 printf(" '%c' ", (char)(0x00FF&c));
66 } else if ( c > 0x007F ) {
67 char buf[1000];
68 UErrorCode status = U_ZERO_ERROR;
69 int32_t o;
70
71 o = u_charName(c, U_EXTENDED_CHAR_NAME, buf, 1000, &status);
72 if(U_SUCCESS(status) && (o>0) ) {
73 buf[6] = 0;
74 printf("%7s", buf);
75 } else {
76 printf(" ??????");
77 }
78 } else {
79 switch((char)(c & 0x007F)) {
80 case ' ':
81 printf(" ' ' ");
82 break;
83 case '\t':
84 printf(" \\t ");
85 break;
86 case '\n':
87 printf(" \\n ");
88 break;
89 default:
90 printf(" _ ");
91 break;
92 }
93 }
94 }
95
96
printUChars(const char * name="?",const UChar * uch=kNone,int32_t len=-1)97 void printUChars(const char *name = "?",
98 const UChar *uch = kNone,
99 int32_t len = -1 )
100 {
101 int32_t i;
102
103 if( (len == -1) && (uch) ) {
104 len = u_strlen(uch);
105 }
106
107 printf("%5s: ", name);
108 for( i = 0; i <len; i++) {
109 printf("%-6d ", i);
110 }
111 printf("\n");
112
113 printf("%5s: ", "uni");
114 for( i = 0; i <len; i++) {
115 printf("\\u%04X ", (int)uch[i]);
116 }
117 printf("\n");
118
119 printf("%5s:", "ch");
120 for( i = 0; i <len; i++) {
121 prettyPrintUChar(uch[i]);
122 }
123 printf("\n");
124 }
125
/*
 * Dump a byte string as three rows: the index of each byte, its \xXX value,
 * and the byte itself in quotes when isgraph() accepts it.
 *
 * name - label printed in front of the index row
 * uch  - the bytes to print
 * len  - number of bytes; -1 means "measure with strlen()", which is only
 *        done when uch is non-NULL
 */
void printBytes(const char *name = "?",
                const char *uch  = "",
                int32_t     len  = -1 )
{
    if (uch && len == -1) {
        len = (int32_t)strlen(uch);
    }

    /* Row 1: positions */
    printf("%5s: ", name);
    for (int32_t pos = 0; pos < len; ++pos) {
        printf("%-4d ", pos);
    }
    printf("\n");

    /* Row 2: numeric values, masked to the low eight bits */
    printf("%5s: ", "uni");
    for (int32_t pos = 0; pos < len; ++pos) {
        printf("\\x%02X ", 0x00FF & (int)uch[pos]);
    }
    printf("\n");

    /* Row 3: readable form, blank when the byte is not a graphic character */
    printf("%5s:", "ch");
    for (int32_t pos = 0; pos < len; ++pos) {
        const int byteValue = 0x00FF & (int)uch[pos];
        if (!isgraph(byteValue)) {
            printf(" ");
            continue;
        }
        printf(" '%c' ", (char)uch[pos]);
    }
    printf("\n");
}
158
printUChar(UChar32 ch32)159 void printUChar(UChar32 ch32)
160 {
161 if(ch32 > 0xFFFF) {
162 printf("ch: U+%06X\n", ch32);
163 }
164 else {
165 UChar ch = (UChar)ch32;
166 printUChars("C", &ch, 1);
167 }
168 }
169
170 /*******************************************************************
171 Very simple C sample to convert the word 'Moscow' in Russian in Unicode,
172 followed by an exclamation mark (!) into the KOI8-R Russian code page.
173
174 This example first creates a UChar String out of the Unicode chars.
175
176 targetSize must be set to the amount of space available in the target
177 buffer. After fromUChars is called,
178 len will contain the number of bytes in target[] which were
179 used in the resulting codepage. In this case, there is a 1:1 mapping
180 between the input and output characters. The exclamation mark has the
181 same value in both KOI8-R and Unicode.
182
183 src: 0 1 2 3 4 5 6
184 uni: \u041C \u043E \u0441 \u043A \u0432 \u0430 \u0021
185 ch: CYRILL CYRILL CYRILL CYRILL CYRILL CYRILL '!'
186
187 targ: 0 1 2 3 4 5 6
188 uni: \xED \xCF \xD3 \xCB \xD7 \xC1 \x21
189 ch: '!'
190
191
192 Converting FROM unicode
193 to koi8-r.
194 You must call ucnv_close to clean up the memory used by the
195 converter.
196
197 'len' returns the number of OUTPUT bytes resulting from the
198 conversion.
199 */
200
convsample_02()201 UErrorCode convsample_02()
202 {
203 printf("\n\n==============================================\n"
204 "Sample 02: C: simple Unicode -> koi8-r conversion\n");
205
206
207 // **************************** START SAMPLE *******************
208 // "cat<cat>OK"
209 UChar source[] = { 0x041C, 0x043E, 0x0441, 0x043A, 0x0432,
210 0x0430, 0x0021, 0x0000 };
211 char target[100];
212 UErrorCode status = U_ZERO_ERROR;
213 UConverter *conv;
214 int32_t len;
215
216 // set up the converter
217 //! [ucnv_open]
218 conv = ucnv_open("koi8-r", &status);
219 //! [ucnv_open]
220 assert(U_SUCCESS(status));
221
222 // convert to koi8-r
223 len = ucnv_fromUChars(conv, target, 100, source, -1, &status);
224 assert(U_SUCCESS(status));
225
226 // close the converter
227 ucnv_close(conv);
228
229 // ***************************** END SAMPLE ********************
230
231 // Print it out
232 printUChars("src", source);
233 printf("\n");
234 printBytes("targ", target, len);
235
236 return U_ZERO_ERROR;
237 }
238
239
convsample_03()240 UErrorCode convsample_03()
241 {
242 printf("\n\n==============================================\n"
243 "Sample 03: C: print out all converters\n");
244
245 int32_t count;
246 int32_t i;
247
248 // **************************** START SAMPLE *******************
249 count = ucnv_countAvailable();
250 printf("Available converters: %d\n", count);
251
252 for(i=0;i<count;i++)
253 {
254 printf("%s ", ucnv_getAvailableName(i));
255 }
256
257 // ***************************** END SAMPLE ********************
258
259 printf("\n");
260
261 return U_ZERO_ERROR;
262 }
263
264
265
266 #define BUFFERSIZE 17 /* make it interesting :) */
267
268 /*
269 Converting from a codepage to Unicode in bulk..
270 What is the best way to determine the buffer size?
271
272 The 'buffersize' is in bytes of input.
273 For a given converter, divinding this by the minimum char size
274 give you the maximum number of Unicode characters that could be
275 expected for a given number of input bytes.
276 see: ucnv_getMinCharSize()
277
278 For example, a single byte codepage like 'Latin-3' has a
279 minimum char size of 1. (It takes at least 1 byte to represent
280 each Unicode char.) So the unicode buffer has the same number of
281 UChars as the input buffer has bytes.
282
283 In a strictly double byte codepage such as cp1362 (Windows
284 Korean), the minimum char size is 2. So, only half as many Unicode
285 chars as bytes are needed.
286
287 This work to calculate the buffer size is an optimization. Any
288 size of input and output buffer can be used, as long as the
289 program handles the following cases: If the input buffer is empty,
290 the source pointer will be equal to sourceLimit. If the output
291 buffer has overflowed, U_BUFFER_OVERFLOW_ERROR will be returned.
292 */
293
convsample_05()294 UErrorCode convsample_05()
295 {
296 printf("\n\n==============================================\n"
297 "Sample 05: C: count the number of letters in a UTF-8 document\n");
298
299 FILE *f;
300 int32_t count;
301 char inBuf[BUFFERSIZE];
302 const char *source;
303 const char *sourceLimit;
304 UChar *uBuf;
305 UChar *target;
306 UChar *targetLimit;
307 UChar *p;
308 int32_t uBufSize = 0;
309 UConverter *conv;
310 UErrorCode status = U_ZERO_ERROR;
311 uint32_t letters=0, total=0;
312
313 f = fopen("data01.txt", "r");
314 if(!f)
315 {
316 fprintf(stderr, "Couldn't open file 'data01.txt' (UTF-8 data file).\n");
317 return U_FILE_ACCESS_ERROR;
318 }
319
320 // **************************** START SAMPLE *******************
321 conv = ucnv_open("utf-8", &status);
322 assert(U_SUCCESS(status));
323
324 uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
325 printf("input bytes %d / min chars %d = %d UChars\n",
326 BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
327 uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
328 assert(uBuf!=NULL);
329
330 // grab another buffer's worth
331 while((!feof(f)) &&
332 ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
333 {
334 // Convert bytes to unicode
335 source = inBuf;
336 sourceLimit = inBuf + count;
337
338 do
339 {
340 target = uBuf;
341 targetLimit = uBuf + uBufSize;
342
343 ucnv_toUnicode(conv, &target, targetLimit,
344 &source, sourceLimit, NULL,
345 feof(f)?TRUE:FALSE, /* pass 'flush' when eof */
346 /* is true (when no more data will come) */
347 &status);
348
349 if(status == U_BUFFER_OVERFLOW_ERROR)
350 {
351 // simply ran out of space - we'll reset the target ptr the next
352 // time through the loop.
353 status = U_ZERO_ERROR;
354 }
355 else
356 {
357 // Check other errors here.
358 assert(U_SUCCESS(status));
359 // Break out of the loop (by force)
360 }
361
362 // Process the Unicode
363 // Todo: handle UTF-16/surrogates
364
365 for(p = uBuf; p<target; p++)
366 {
367 if(u_isalpha(*p))
368 letters++;
369 total++;
370 }
371 } while (source < sourceLimit); // while simply out of space
372 }
373
374 printf("%d letters out of %d total UChars.\n", letters, total);
375
376 // ***************************** END SAMPLE ********************
377 ucnv_close(conv);
378
379 printf("\n");
380
381 fclose(f);
382
383 return U_ZERO_ERROR;
384 }
385 #undef BUFFERSIZE
386
387 #define BUFFERSIZE 1024
388 typedef struct
389 {
390 UChar32 codepoint;
391 uint32_t frequency;
392 } CharFreqInfo;
393
convsample_06()394 UErrorCode convsample_06()
395 {
396 printf("\n\n==============================================\n"
397 "Sample 06: C: frequency distribution of letters in a UTF-8 document\n");
398
399 FILE *f;
400 int32_t count;
401 char inBuf[BUFFERSIZE];
402 const char *source;
403 const char *sourceLimit;
404 int32_t uBufSize = 0;
405 UConverter *conv;
406 UErrorCode status = U_ZERO_ERROR;
407 uint32_t letters=0, total=0;
408
409 CharFreqInfo *info;
410 UChar32 charCount = 0x10000; /* increase this if you want to handle non bmp.. todo: automatically bump it.. */
411 UChar32 p;
412
413 uint32_t ie = 0;
414 uint32_t gh = 0;
415 UChar32 l = 0;
416
417 f = fopen("data06.txt", "r");
418 if(!f)
419 {
420 fprintf(stderr, "Couldn't open file 'data06.txt' (UTF-8 data file).\n");
421 return U_FILE_ACCESS_ERROR;
422 }
423
424 info = (CharFreqInfo*)malloc(sizeof(CharFreqInfo) * charCount);
425 if(!info)
426 {
427 fprintf(stderr, " Couldn't allocate %d bytes for freq counter\n", sizeof(CharFreqInfo)*charCount);
428 }
429
430 /* reset frequencies */
431 for(p=0;p<charCount;p++)
432 {
433 info[p].codepoint = p;
434 info[p].frequency = 0;
435 }
436
437 // **************************** START SAMPLE *******************
438 conv = ucnv_open("utf-8", &status);
439 assert(U_SUCCESS(status));
440
441 uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
442 printf("input bytes %d / min chars %d = %d UChars\n",
443 BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
444
445 // grab another buffer's worth
446 while((!feof(f)) &&
447 ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
448 {
449 // Convert bytes to unicode
450 source = inBuf;
451 sourceLimit = inBuf + count;
452
453 while(source < sourceLimit)
454 {
455 p = ucnv_getNextUChar(conv, &source, sourceLimit, &status);
456 if(U_FAILURE(status))
457 {
458 fprintf(stderr, "%s @ %d\n", u_errorName(status), total);
459 status = U_ZERO_ERROR;
460 continue;
461 }
462 U_ASSERT(status);
463 total++;
464
465 if(u_isalpha(p))
466 letters++;
467
468 if((u_tolower(l) == 'i') && (u_tolower(p) == 'e'))
469 ie++;
470
471 if((u_tolower(l) == 'g') && (u_tolower(p) == 0x0127))
472 gh++;
473
474 if(p>charCount)
475 {
476 fprintf(stderr, "U+%06X: oh.., we only handle BMP characters so far.. redesign!\n", p);
477 free(info);
478 fclose(f);
479 ucnv_close(conv);
480 return U_UNSUPPORTED_ERROR;
481 }
482 info[p].frequency++;
483 l = p;
484 }
485 }
486
487 fclose(f);
488 ucnv_close(conv);
489
490 printf("%d letters out of %d total UChars.\n", letters, total);
491 printf("%d ie digraphs, %d gh digraphs.\n", ie, gh);
492
493 // now, we could sort it..
494
495 // qsort(info, charCount, sizeof(info[0]), charfreq_compare);
496
497 for(p=0;p<charCount;p++)
498 {
499 if(info[p].frequency)
500 {
501 printf("% 5d U+%06X ", info[p].frequency, p);
502 if(p <= 0xFFFF)
503 {
504 prettyPrintUChar((UChar)p);
505 }
506 printf("\n");
507 }
508 }
509 free(info);
510 // ***************************** END SAMPLE ********************
511
512 printf("\n");
513
514 return U_ZERO_ERROR;
515 }
516 #undef BUFFERSIZE
517
518
519 /******************************************************
520 You must call ucnv_close to clean up the memory used by the
521 converter.
522
523 'len' returns the number of OUTPUT bytes resulting from the
524 conversion.
525 */
526
convsample_12()527 UErrorCode convsample_12()
528 {
529 printf("\n\n==============================================\n"
530 "Sample 12: C: simple sjis -> unicode conversion\n");
531
532
533 // **************************** START SAMPLE *******************
534
535 char source[] = { 0x63, 0x61, 0x74, (char)0x94, 0x4C, (char)0x82, 0x6E, (char)0x82, 0x6A, 0x00 };
536 UChar target[100];
537 UErrorCode status = U_ZERO_ERROR;
538 UConverter *conv;
539 int32_t len;
540
541 // set up the converter
542 conv = ucnv_open("shift_jis", &status);
543 assert(U_SUCCESS(status));
544
545 // convert to Unicode
546 // Note: we can use strlen, we know it's an 8 bit null terminated codepage
547 target[6] = 0xFDCA;
548 len = ucnv_toUChars(conv, target, 100, source, strlen(source), &status);
549 U_ASSERT(status);
550 // close the converter
551 ucnv_close(conv);
552
553 // ***************************** END SAMPLE ********************
554
555 // Print it out
556 printBytes("src", source, strlen(source) );
557 printf("\n");
558 printUChars("targ", target, len);
559
560 return U_ZERO_ERROR;
561 }
562
563 /******************************************************************
564 C: Convert from codepage to Unicode one at a time.
565 */
566
convsample_13()567 UErrorCode convsample_13()
568 {
569 printf("\n\n==============================================\n"
570 "Sample 13: C: simple Big5 -> unicode conversion, char at a time\n");
571
572
573 const char sourceChars[] = { 0x7a, 0x68, 0x3d, (char)0xa4, (char)0xa4, (char)0xa4, (char)0xe5, (char)0x2e };
574 // const char sourceChars[] = { 0x7a, 0x68, 0x3d, 0xe4, 0xb8, 0xad, 0xe6, 0x96, 0x87, 0x2e };
575 const char *source, *sourceLimit;
576 UChar32 target;
577 UErrorCode status = U_ZERO_ERROR;
578 UConverter *conv = NULL;
579 int32_t srcCount=0;
580 int32_t dstCount=0;
581
582 srcCount = sizeof(sourceChars);
583
584 conv = ucnv_open("Big5", &status);
585 U_ASSERT(status);
586
587 source = sourceChars;
588 sourceLimit = sourceChars + sizeof(sourceChars);
589
590 // **************************** START SAMPLE *******************
591
592
593 printBytes("src",source,sourceLimit-source);
594
595 while(source < sourceLimit)
596 {
597 puts("");
598 target = ucnv_getNextUChar (conv,
599 &source,
600 sourceLimit,
601 &status);
602
603 // printBytes("src",source,sourceLimit-source);
604 U_ASSERT(status);
605 printUChar(target);
606 dstCount++;
607 }
608
609
610 // ************************** END SAMPLE *************************
611
612 printf("src=%d bytes, dst=%d uchars\n", srcCount, dstCount);
613 ucnv_close(conv);
614
615 return U_ZERO_ERROR;
616 }
617
618
619
620
convsample_20_didSubstitute(const char * source)621 UBool convsample_20_didSubstitute(const char *source)
622 {
623 UChar uchars[100];
624 char bytes[100];
625 UConverter *conv = NULL;
626 UErrorCode status = U_ZERO_ERROR;
627 uint32_t len, len2;
628 UBool flagVal;
629
630 FromUFLAGContext * context = NULL;
631
632 printf("\n\n==============================================\n"
633 "Sample 20: C: Test for substitution using callbacks\n");
634
635 /* print out the original source */
636 printBytes("src", source);
637 printf("\n");
638
639 /* First, convert from UTF8 to unicode */
640 conv = ucnv_open("utf-8", &status);
641 U_ASSERT(status);
642
643 len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status);
644 U_ASSERT(status);
645
646 printUChars("uch", uchars, len);
647 printf("\n");
648
649 /* Now, close the converter */
650 ucnv_close(conv);
651
652 /* Now, convert to windows-1252 */
653 conv = ucnv_open("windows-1252", &status);
654 U_ASSERT(status);
655
656 /* Converter starts out with the SUBSTITUTE callback set. */
657
658 /* initialize our callback */
659 context = flagCB_fromU_openContext();
660
661 /* Set our special callback */
662 ucnv_setFromUCallBack(conv,
663 flagCB_fromU,
664 context,
665 &(context->subCallback),
666 &(context->subContext),
667 &status);
668
669 U_ASSERT(status);
670
671 len2 = ucnv_fromUChars(conv, bytes, 100, uchars, len, &status);
672 U_ASSERT(status);
673
674 flagVal = context->flag; /* it's about to go away when we close the cnv */
675
676 ucnv_close(conv);
677
678 /* print out the original source */
679 printBytes("bytes", bytes, len2);
680
681 return flagVal; /* true if callback was called */
682 }
683
convsample_20()684 UErrorCode convsample_20()
685 {
686 const char *sample1 = "abc\xdf\xbf";
687 const char *sample2 = "abc_def";
688
689
690 if(convsample_20_didSubstitute(sample1))
691 {
692 printf("DID substitute.\n******\n");
693 }
694 else
695 {
696 printf("Did NOT substitute.\n*****\n");
697 }
698
699 if(convsample_20_didSubstitute(sample2))
700 {
701 printf("DID substitute.\n******\n");
702 }
703 else
704 {
705 printf("Did NOT substitute.\n*****\n");
706 }
707
708 return U_ZERO_ERROR;
709 }
710
711 // 21 - C, callback, with clone and debug
712
713
714
convsample_21_didSubstitute(const char * source)715 UBool convsample_21_didSubstitute(const char *source)
716 {
717 UChar uchars[100];
718 char bytes[100];
719 UConverter *conv = NULL, *cloneCnv = NULL;
720 UErrorCode status = U_ZERO_ERROR;
721 uint32_t len, len2;
722 int32_t cloneLen;
723 UBool flagVal = FALSE;
724 UConverterFromUCallback junkCB;
725
726 FromUFLAGContext *flagCtx = NULL,
727 *cloneFlagCtx = NULL;
728
729 debugCBContext *debugCtx1 = NULL,
730 *debugCtx2 = NULL,
731 *cloneDebugCtx = NULL;
732
733 printf("\n\n==============================================\n"
734 "Sample 21: C: Test for substitution w/ callbacks & clones \n");
735
736 /* print out the original source */
737 printBytes("src", source);
738 printf("\n");
739
740 /* First, convert from UTF8 to unicode */
741 conv = ucnv_open("utf-8", &status);
742 U_ASSERT(status);
743
744 len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status);
745 U_ASSERT(status);
746
747 printUChars("uch", uchars, len);
748 printf("\n");
749
750 /* Now, close the converter */
751 ucnv_close(conv);
752
753 /* Now, convert to windows-1252 */
754 conv = ucnv_open("windows-1252", &status);
755 U_ASSERT(status);
756
757 /* Converter starts out with the SUBSTITUTE callback set. */
758
759 /* initialize our callback */
760 /* from the 'bottom' innermost, out
761 * CNV -> debugCtx1[debug] -> flagCtx[flag] -> debugCtx2[debug] */
762
763 #if DEBUG_TMI
764 printf("flagCB_fromU = %p\n", &flagCB_fromU);
765 printf("debugCB_fromU = %p\n", &debugCB_fromU);
766 #endif
767
768 debugCtx1 = debugCB_openContext();
769 flagCtx = flagCB_fromU_openContext();
770 debugCtx2 = debugCB_openContext();
771
772 debugCtx1->subCallback = flagCB_fromU; /* debug1 -> flag */
773 debugCtx1->subContext = flagCtx;
774
775 flagCtx->subCallback = debugCB_fromU; /* flag -> debug2 */
776 flagCtx->subContext = debugCtx2;
777
778 debugCtx2->subCallback = UCNV_FROM_U_CALLBACK_SUBSTITUTE;
779 debugCtx2->subContext = NULL;
780
781 /* Set our special callback */
782
783 ucnv_setFromUCallBack(conv,
784 debugCB_fromU,
785 debugCtx1,
786 &(debugCtx2->subCallback),
787 &(debugCtx2->subContext),
788 &status);
789
790 U_ASSERT(status);
791
792 #if DEBUG_TMI
793 printf("Callback chain now: Converter %p -> debug1:%p-> (%p:%p)==flag:%p -> debug2:%p -> cb %p\n",
794 conv, debugCtx1, debugCtx1->subCallback,
795 debugCtx1->subContext, flagCtx, debugCtx2, debugCtx2->subCallback);
796 #endif
797
798 cloneCnv = ucnv_safeClone(conv, NULL, NULL, &status);
799
800 U_ASSERT(status);
801
802 #if DEBUG_TMI
803 printf("Cloned converter from %p -> %p. Closing %p.\n", conv, cloneCnv, conv);
804 #endif
805
806 ucnv_close(conv);
807
808 #if DEBUG_TMI
809 printf("%p closed.\n", conv);
810 #endif
811
812 U_ASSERT(status);
813 /* Now, we have to extract the context */
814 cloneDebugCtx = NULL;
815 cloneFlagCtx = NULL;
816
817 ucnv_getFromUCallBack(cloneCnv, &junkCB, (const void **)&cloneDebugCtx);
818 if(cloneDebugCtx != NULL) {
819 cloneFlagCtx = (FromUFLAGContext*) cloneDebugCtx -> subContext;
820 }
821
822 printf("Cloned converter chain: %p -> %p[debug1] -> %p[flag] -> %p[debug2] -> substitute\n",
823 cloneCnv, cloneDebugCtx, cloneFlagCtx, cloneFlagCtx?cloneFlagCtx->subContext:NULL );
824
825 len2 = ucnv_fromUChars(cloneCnv, bytes, 100, uchars, len, &status);
826 U_ASSERT(status);
827
828 if(cloneFlagCtx != NULL) {
829 flagVal = cloneFlagCtx->flag; /* it's about to go away when we close the cnv */
830 } else {
831 printf("** Warning, couldn't get the subcallback \n");
832 }
833
834 ucnv_close(cloneCnv);
835
836 /* print out the original source */
837 printBytes("bytes", bytes, len2);
838
839 return flagVal; /* true if callback was called */
840 }
841
convsample_21()842 UErrorCode convsample_21()
843 {
844 const char *sample1 = "abc\xdf\xbf";
845 const char *sample2 = "abc_def";
846
847 if(convsample_21_didSubstitute(sample1))
848 {
849 printf("DID substitute.\n******\n");
850 }
851 else
852 {
853 printf("Did NOT substitute.\n*****\n");
854 }
855
856 if(convsample_21_didSubstitute(sample2))
857 {
858 printf("DID substitute.\n******\n");
859 }
860 else
861 {
862 printf("Did NOT substitute.\n*****\n");
863 }
864
865 return U_ZERO_ERROR;
866 }
867
868
869 // 40- C, cp37 -> UTF16 [data02.bin -> data40.utf16]
870
871 #define BUFFERSIZE 17 /* make it interesting :) */
872
convsample_40()873 UErrorCode convsample_40()
874 {
875 printf("\n\n==============================================\n"
876 "Sample 40: C: convert data02.bin from cp37 to UTF16 [data40.utf16]\n");
877
878 FILE *f;
879 FILE *out;
880 int32_t count;
881 char inBuf[BUFFERSIZE];
882 const char *source;
883 const char *sourceLimit;
884 UChar *uBuf;
885 UChar *target;
886 UChar *targetLimit;
887 int32_t uBufSize = 0;
888 UConverter *conv = NULL;
889 UErrorCode status = U_ZERO_ERROR;
890 uint32_t inbytes=0, total=0;
891
892 f = fopen("data02.bin", "rb");
893 if(!f)
894 {
895 fprintf(stderr, "Couldn't open file 'data02.bin' (cp37 data file).\n");
896 return U_FILE_ACCESS_ERROR;
897 }
898
899 out = fopen("data40.utf16", "wb");
900 if(!out)
901 {
902 fprintf(stderr, "Couldn't create file 'data40.utf16'.\n");
903 fclose(f);
904 return U_FILE_ACCESS_ERROR;
905 }
906
907 // **************************** START SAMPLE *******************
908 conv = ucnv_openCCSID(37, UCNV_IBM, &status);
909 assert(U_SUCCESS(status));
910
911 uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
912 printf("input bytes %d / min chars %d = %d UChars\n",
913 BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
914 uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
915 assert(uBuf!=NULL);
916
917 // grab another buffer's worth
918 while((!feof(f)) &&
919 ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
920 {
921 inbytes += count;
922
923 // Convert bytes to unicode
924 source = inBuf;
925 sourceLimit = inBuf + count;
926
927 do
928 {
929 target = uBuf;
930 targetLimit = uBuf + uBufSize;
931
932 ucnv_toUnicode( conv, &target, targetLimit,
933 &source, sourceLimit, NULL,
934 feof(f)?TRUE:FALSE, /* pass 'flush' when eof */
935 /* is true (when no more data will come) */
936 &status);
937
938 if(status == U_BUFFER_OVERFLOW_ERROR)
939 {
940 // simply ran out of space - we'll reset the target ptr the next
941 // time through the loop.
942 status = U_ZERO_ERROR;
943 }
944 else
945 {
946 // Check other errors here.
947 assert(U_SUCCESS(status));
948 // Break out of the loop (by force)
949 }
950
951 // Process the Unicode
952 // Todo: handle UTF-16/surrogates
953 assert(fwrite(uBuf, sizeof(uBuf[0]), (target-uBuf), out) ==
954 (size_t)(target-uBuf));
955 total += (target-uBuf);
956 } while (source < sourceLimit); // while simply out of space
957 }
958
959 printf("%d bytes in, %d UChars out.\n", inbytes, total);
960
961 // ***************************** END SAMPLE ********************
962 ucnv_close(conv);
963
964 fclose(f);
965 fclose(out);
966 printf("\n");
967
968 return U_ZERO_ERROR;
969 }
970 #undef BUFFERSIZE
971
972
973
974 // 46- C, UTF16 -> latin2 [data40.utf16 -> data46.out]
975
976 #define BUFFERSIZE 24 /* make it interesting :) */
977
convsample_46()978 UErrorCode convsample_46()
979 {
980 printf("\n\n==============================================\n"
981 "Sample 46: C: convert data40.utf16 from UTF16 to latin2 [data46.out]\n");
982
983 FILE *f;
984 FILE *out;
985 int32_t count;
986 UChar inBuf[BUFFERSIZE];
987 const UChar *source;
988 const UChar *sourceLimit;
989 char *buf;
990 char *target;
991 char *targetLimit;
992
993 int32_t bufSize = 0;
994 UConverter *conv = NULL;
995 UErrorCode status = U_ZERO_ERROR;
996 uint32_t inchars=0, total=0;
997
998 f = fopen("data40.utf16", "rb");
999 if(!f)
1000 {
1001 fprintf(stderr, "Couldn't open file 'data40.utf16' (did you run convsample_40() ?)\n");
1002 return U_FILE_ACCESS_ERROR;
1003 }
1004
1005 out = fopen("data46.out", "wb");
1006 if(!out)
1007 {
1008 fprintf(stderr, "Couldn't create file 'data46.out'.\n");
1009 fclose(f);
1010 return U_FILE_ACCESS_ERROR;
1011 }
1012
1013 // **************************** START SAMPLE *******************
1014 conv = ucnv_open( "iso-8859-2", &status);
1015 assert(U_SUCCESS(status));
1016
1017 bufSize = (BUFFERSIZE*ucnv_getMaxCharSize(conv));
1018 printf("input UChars[16] %d * max charsize %d = %d bytes output buffer\n",
1019 BUFFERSIZE, ucnv_getMaxCharSize(conv), bufSize);
1020 buf = (char*)malloc(bufSize * sizeof(char));
1021 assert(buf!=NULL);
1022
1023 // grab another buffer's worth
1024 while((!feof(f)) &&
1025 ((count=fread(inBuf, sizeof(UChar), BUFFERSIZE , f)) > 0) )
1026 {
1027 inchars += count;
1028
1029 // Convert bytes to unicode
1030 source = inBuf;
1031 sourceLimit = inBuf + count;
1032
1033 do
1034 {
1035 target = buf;
1036 targetLimit = buf + bufSize;
1037
1038 ucnv_fromUnicode( conv, &target, targetLimit,
1039 &source, sourceLimit, NULL,
1040 feof(f)?TRUE:FALSE, /* pass 'flush' when eof */
1041 /* is true (when no more data will come) */
1042 &status);
1043
1044 if(status == U_BUFFER_OVERFLOW_ERROR)
1045 {
1046 // simply ran out of space - we'll reset the target ptr the next
1047 // time through the loop.
1048 status = U_ZERO_ERROR;
1049 }
1050 else
1051 {
1052 // Check other errors here.
1053 assert(U_SUCCESS(status));
1054 // Break out of the loop (by force)
1055 }
1056
1057 // Process the Unicode
1058 assert(fwrite(buf, sizeof(buf[0]), (target-buf), out) ==
1059 (size_t)(target-buf));
1060 total += (target-buf);
1061 } while (source < sourceLimit); // while simply out of space
1062 }
1063
1064 printf("%d Uchars (%d bytes) in, %d chars out.\n", inchars, inchars * sizeof(UChar), total);
1065
1066 // ***************************** END SAMPLE ********************
1067 ucnv_close(conv);
1068
1069 fclose(f);
1070 fclose(out);
1071 printf("\n");
1072
1073 return U_ZERO_ERROR;
1074 }
1075 #undef BUFFERSIZE
1076
1077 #define BUFFERSIZE 219
1078
convsample_50()1079 void convsample_50() {
1080 printf("\n\n==============================================\n"
1081 "Sample 50: C: ucnv_detectUnicodeSignature\n");
1082
1083 //! [ucnv_detectUnicodeSignature]
1084 UErrorCode err = U_ZERO_ERROR;
1085 UBool discardSignature = TRUE; /* set to TRUE to throw away the initial U+FEFF */
1086 char input[] = { '\xEF','\xBB', '\xBF','\x41','\x42','\x43' };
1087 int32_t signatureLength = 0;
1088 const char *encoding = ucnv_detectUnicodeSignature(input,sizeof(input),&signatureLength,&err);
1089 UConverter *conv = NULL;
1090 UChar output[100];
1091 UChar *target = output, *out;
1092 const char *source = input;
1093 if(encoding!=NULL && U_SUCCESS(err)){
1094 // should signature be discarded ?
1095 conv = ucnv_open(encoding, &err);
1096 // do the conversion
1097 ucnv_toUnicode(conv,
1098 &target, output + UPRV_LENGTHOF(output),
1099 &source, input + sizeof(input),
1100 NULL, TRUE, &err);
1101 out = output;
1102 if (discardSignature){
1103 ++out; // ignore initial U+FEFF
1104 }
1105 while(out != target) {
1106 printf("%04x ", *out++);
1107 }
1108 puts("");
1109 }
1110 //! [ucnv_detectUnicodeSignature]
1111 puts("");
1112 }
1113
1114
1115
1116 /* main */
1117
main()1118 int main()
1119 {
1120
1121 printf("Default Converter=%s\n", ucnv_getDefaultName() );
1122
1123 convsample_02(); // C , u->koi8r, conv
1124 convsample_03(); // C, iterate
1125
1126 convsample_05(); // C, utf8->u, getNextUChar
1127 convsample_06(); // C freq counter thingy
1128
1129 convsample_12(); // C, sjis->u, conv
1130 convsample_13(); // C, big5->u, getNextU
1131
1132 convsample_20(); // C, callback
1133 convsample_21(); // C, callback debug
1134
1135 convsample_40(); // C, cp37 -> UTF16 [data02.bin -> data40.utf16]
1136
1137 convsample_46(); // C, UTF16 -> latin3 [data41.utf16 -> data46.out]
1138
1139 convsample_50(); // C, detect unicode signature
1140
1141 printf("End of converter samples.\n");
1142
1143 fflush(stdout);
1144 fflush(stderr);
1145
1146 return 0;
1147 }
1148