1 /*************************************************************************
2 *
3 * © 2016 and later: Unicode, Inc. and others.
4 * License & terms of use: http://www.unicode.org/copyright.html
5 *
6 **************************************************************************
7 **************************************************************************
8 *
9 * Copyright (C) 2000-2016, International Business Machines
10 * Corporation and others. All Rights Reserved.
11 *
12 ***************************************************************************
13 * file name: convsamp.c
14 * encoding: ASCII (7-bit)
15 *
16 * created on: 2000may30
17 * created by: Steven R. Loomis
18 *
19 * Sample code for the ICU conversion routines.
20 *
21 * Note: Nothing special is needed to build this sample. Link with
22 * the icu UC and icu I18N libraries.
23 *
24 * I use 'assert' for error checking, you probably will want
25 * something more flexible. '***BEGIN SAMPLE***' and
26 * '***END SAMPLE***' mark pieces suitable for stand alone
27 * code snippets.
28 *
29 *
 * Each test can define its own BUFFERSIZE
31 *
32 */
33
34 #define DEBUG_TMI 0 /* define to 1 to enable Too Much Information */
35
36 #include <stdio.h>
37 #include <ctype.h> /* for isspace, etc. */
38 #include <assert.h>
39 #include <string.h>
40 #include <stdlib.h> /* malloc */
41
42 #include "unicode/utypes.h" /* Basic ICU data types */
43 #include "unicode/ucnv.h" /* C Converter API */
44 #include "unicode/ustring.h" /* some more string fcns*/
45 #include "unicode/uchar.h" /* char names */
46 #include "unicode/uloc.h"
47 #include "unicode/unistr.h"
48
49 #include "flagcb.h"
50
51 /* Some utility functions */
52 #ifndef UPRV_LENGTHOF
53 #define UPRV_LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
54 #endif
55
56 static const UChar kNone[] = { 0x0000 };
57
58 #define U_ASSERT(x) { if(U_FAILURE(x)) {fflush(stdout);fflush(stderr); fprintf(stderr, #x " == %s\n", u_errorName(x)); assert(U_SUCCESS(x)); }}
59
60 /* Print a UChar if possible, in seven characters. */
prettyPrintUChar(UChar c)61 void prettyPrintUChar(UChar c)
62 {
63 if( (c <= 0x007F) &&
64 (isgraph(c)) ) {
65 printf(" '%c' ", (char)(0x00FF&c));
66 } else if ( c > 0x007F ) {
67 char buf[1000];
68 UErrorCode status = U_ZERO_ERROR;
69 int32_t o;
70
71 o = u_charName(c, U_EXTENDED_CHAR_NAME, buf, 1000, &status);
72 if(U_SUCCESS(status) && (o>0) ) {
73 buf[6] = 0;
74 printf("%7s", buf);
75 } else {
76 printf(" ??????");
77 }
78 } else {
79 switch((char)(c & 0x007F)) {
80 case ' ':
81 printf(" ' ' ");
82 break;
83 case '\t':
84 printf(" \\t ");
85 break;
86 case '\n':
87 printf(" \\n ");
88 break;
89 default:
90 printf(" _ ");
91 break;
92 }
93 }
94 }
95
96
printUChars(const char * name="?",const UChar * uch=kNone,int32_t len=-1)97 void printUChars(const char *name = "?",
98 const UChar *uch = kNone,
99 int32_t len = -1 )
100 {
101 int32_t i;
102
103 if( (len == -1) && (uch) ) {
104 len = u_strlen(uch);
105 }
106
107 printf("%5s: ", name);
108 for( i = 0; i <len; i++) {
109 printf("%-6d ", i);
110 }
111 printf("\n");
112
113 printf("%5s: ", "uni");
114 for( i = 0; i <len; i++) {
115 printf("\\u%04X ", (int)uch[i]);
116 }
117 printf("\n");
118
119 printf("%5s:", "ch");
120 for( i = 0; i <len; i++) {
121 prettyPrintUChar(uch[i]);
122 }
123 printf("\n");
124 }
125
/*
 * Dump a byte string as three aligned rows: the index of each byte, its
 * value as \xXX, and the byte itself when it is a printable character.
 *
 * name  label printed at the start of the index row
 * uch   the bytes to print
 * len   number of bytes, or -1 to use strlen(uch) when uch is non-NULL
 */
void printBytes(const char *name = "?",
                const char *uch  = "",
                int32_t     len  = -1 )
{
    if (uch && len == -1) {
        len = static_cast<int32_t>(strlen(uch));
    }

    /* Row 1: indices */
    printf("%5s: ", name);
    for (int32_t col = 0; col < len; ++col) {
        printf("%-4d ", col);
    }
    printf("\n");

    /* Row 2: byte values (masked so that negative chars print as two digits) */
    printf("%5s: ", "uni");
    for (int32_t col = 0; col < len; ++col) {
        const int byteValue = 0x00FF & (int)uch[col];
        printf("\\x%02X ", byteValue);
    }
    printf("\n");

    /* Row 3: the character itself if printable, otherwise a blank cell */
    printf("%5s:", "ch");
    for (int32_t col = 0; col < len; ++col) {
        const int byteValue = 0x00FF & (int)uch[col];
        if (!isgraph(byteValue)) {
            printf(" ");
            continue;
        }
        printf(" '%c' ", (char)uch[col]);
    }
    printf("\n");
}
158
printUChar(UChar32 ch32)159 void printUChar(UChar32 ch32)
160 {
161 if(ch32 > 0xFFFF) {
162 printf("ch: U+%06X\n", ch32);
163 }
164 else {
165 UChar ch = (UChar)ch32;
166 printUChars("C", &ch, 1);
167 }
168 }
169
170 /*******************************************************************
171 Very simple C sample to convert the word 'Moscow' in Russian in Unicode,
172 followed by an exclamation mark (!) into the KOI8-R Russian code page.
173
174 This example first creates a UChar String out of the Unicode chars.
175
176 targetSize must be set to the amount of space available in the target
177 buffer. After fromUChars is called,
178 len will contain the number of bytes in target[] which were
179 used in the resulting codepage. In this case, there is a 1:1 mapping
180 between the input and output characters. The exclamation mark has the
181 same value in both KOI8-R and Unicode.
182
183 src: 0 1 2 3 4 5 6
184 uni: \u041C \u043E \u0441 \u043A \u0432 \u0430 \u0021
185 ch: CYRILL CYRILL CYRILL CYRILL CYRILL CYRILL '!'
186
187 targ: 0 1 2 3 4 5 6
188 uni: \xED \xCF \xD3 \xCB \xD7 \xC1 \x21
189 ch: '!'
190
191
192 Converting FROM unicode
193 to koi8-r.
194 You must call ucnv_close to clean up the memory used by the
195 converter.
196
197 'len' returns the number of OUTPUT bytes resulting from the
198 conversion.
199 */
200
convsample_02()201 UErrorCode convsample_02()
202 {
203 printf("\n\n==============================================\n"
204 "Sample 02: C: simple Unicode -> koi8-r conversion\n");
205
206
207 // **************************** START SAMPLE *******************
208 // "cat<cat>OK"
209 UChar source[] = { 0x041C, 0x043E, 0x0441, 0x043A, 0x0432,
210 0x0430, 0x0021, 0x0000 };
211 char target[100];
212 UErrorCode status = U_ZERO_ERROR;
213 UConverter *conv;
214 int32_t len;
215
216 // set up the converter
217 //! [ucnv_open]
218 conv = ucnv_open("koi8-r", &status);
219 //! [ucnv_open]
220 assert(U_SUCCESS(status));
221
222 // convert to koi8-r
223 len = ucnv_fromUChars(conv, target, 100, source, -1, &status);
224 assert(U_SUCCESS(status));
225
226 // close the converter
227 ucnv_close(conv);
228
229 // ***************************** END SAMPLE ********************
230
231 // Print it out
232 printUChars("src", source);
233 printf("\n");
234 printBytes("targ", target, len);
235
236 return U_ZERO_ERROR;
237 }
238
239
convsample_03()240 UErrorCode convsample_03()
241 {
242 printf("\n\n==============================================\n"
243 "Sample 03: C: print out all converters\n");
244
245 int32_t count;
246 int32_t i;
247
248 // **************************** START SAMPLE *******************
249 count = ucnv_countAvailable();
250 printf("Available converters: %d\n", count);
251
252 for(i=0;i<count;i++)
253 {
254 printf("%s ", ucnv_getAvailableName(i));
255 }
256
257 // ***************************** END SAMPLE ********************
258
259 printf("\n");
260
261 return U_ZERO_ERROR;
262 }
263
264
265
266 #define BUFFERSIZE 17 /* make it interesting :) */
267
268 /*
269 Converting from a codepage to Unicode in bulk..
270 What is the best way to determine the buffer size?
271
272 The 'buffersize' is in bytes of input.
273 For a given converter, divinding this by the minimum char size
274 give you the maximum number of Unicode characters that could be
275 expected for a given number of input bytes.
276 see: ucnv_getMinCharSize()
277
278 For example, a single byte codepage like 'Latin-3' has a
279 minimum char size of 1. (It takes at least 1 byte to represent
280 each Unicode char.) So the unicode buffer has the same number of
281 UChars as the input buffer has bytes.
282
283 In a strictly double byte codepage such as cp1362 (Windows
284 Korean), the minimum char size is 2. So, only half as many Unicode
285 chars as bytes are needed.
286
287 This work to calculate the buffer size is an optimization. Any
288 size of input and output buffer can be used, as long as the
289 program handles the following cases: If the input buffer is empty,
290 the source pointer will be equal to sourceLimit. If the output
291 buffer has overflowed, U_BUFFER_OVERFLOW_ERROR will be returned.
292 */
293
convsample_05()294 UErrorCode convsample_05()
295 {
296 printf("\n\n==============================================\n"
297 "Sample 05: C: count the number of letters in a UTF-8 document\n");
298
299 FILE *f;
300 int32_t count;
301 char inBuf[BUFFERSIZE];
302 const char *source;
303 const char *sourceLimit;
304 UChar *uBuf;
305 UChar *target;
306 UChar *targetLimit;
307 UChar *p;
308 int32_t uBufSize = 0;
309 UConverter *conv;
310 UErrorCode status = U_ZERO_ERROR;
311 uint32_t letters=0, total=0;
312
313 f = fopen("data01.txt", "r");
314 if(!f)
315 {
316 fprintf(stderr, "Couldn't open file 'data01.txt' (UTF-8 data file).\n");
317 return U_FILE_ACCESS_ERROR;
318 }
319
320 // **************************** START SAMPLE *******************
321 conv = ucnv_open("utf-8", &status);
322 assert(U_SUCCESS(status));
323
324 uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
325 printf("input bytes %d / min chars %d = %d UChars\n",
326 BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
327 uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
328 assert(uBuf!=NULL);
329
330 // grab another buffer's worth
331 while((!feof(f)) &&
332 ((count=static_cast<int32_t>(fread(inBuf, 1, BUFFERSIZE , f))) > 0) )
333 {
334 // Convert bytes to unicode
335 source = inBuf;
336 sourceLimit = inBuf + count;
337
338 do
339 {
340 target = uBuf;
341 targetLimit = uBuf + uBufSize;
342
343 ucnv_toUnicode(conv, &target, targetLimit,
344 &source, sourceLimit, NULL,
345 feof(f)?true:false, /* pass 'flush' when eof */
346 /* is true (when no more data will come) */
347 &status);
348
349 if(status == U_BUFFER_OVERFLOW_ERROR)
350 {
351 // simply ran out of space - we'll reset the target ptr the next
352 // time through the loop.
353 status = U_ZERO_ERROR;
354 }
355 else
356 {
357 // Check other errors here.
358 assert(U_SUCCESS(status));
359 // Break out of the loop (by force)
360 }
361
362 // Process the Unicode
363 // Todo: handle UTF-16/surrogates
364
365 for(p = uBuf; p<target; p++)
366 {
367 if(u_isalpha(*p))
368 letters++;
369 total++;
370 }
371 } while (source < sourceLimit); // while simply out of space
372 }
373
374 printf("%d letters out of %d total UChars.\n", letters, total);
375
376 // ***************************** END SAMPLE ********************
377 ucnv_close(conv);
378
379 printf("\n");
380
381 fclose(f);
382
383 return U_ZERO_ERROR;
384 }
385 #undef BUFFERSIZE
386
387 #define BUFFERSIZE 1024
388 typedef struct
389 {
390 UChar32 codepoint;
391 uint32_t frequency;
392 } CharFreqInfo;
393
convsample_06()394 UErrorCode convsample_06()
395 {
396 printf("\n\n==============================================\n"
397 "Sample 06: C: frequency distribution of letters in a UTF-8 document\n");
398
399 FILE *f;
400 int32_t count;
401 char inBuf[BUFFERSIZE];
402 const char *source;
403 const char *sourceLimit;
404 int32_t uBufSize = 0;
405 UConverter *conv;
406 UErrorCode status = U_ZERO_ERROR;
407 uint32_t letters=0, total=0;
408
409 CharFreqInfo *info;
410 UChar32 charCount = 0x10000; /* increase this if you want to handle non bmp.. todo: automatically bump it.. */
411 UChar32 p;
412
413 uint32_t ie = 0;
414 uint32_t gh = 0;
415 UChar32 l = 0;
416
417 f = fopen("data06.txt", "r");
418 if(!f)
419 {
420 fprintf(stderr, "Couldn't open file 'data06.txt' (UTF-8 data file).\n");
421 return U_FILE_ACCESS_ERROR;
422 }
423
424 info = (CharFreqInfo*)malloc(sizeof(CharFreqInfo) * charCount);
425 if(!info)
426 {
427 fprintf(stderr, " Couldn't allocate %d bytes for freq counter\n", static_cast<int>(sizeof(CharFreqInfo)*charCount));
428 }
429
430 /* reset frequencies */
431 for(p=0;p<charCount;p++)
432 {
433 info[p].codepoint = p;
434 info[p].frequency = 0;
435 }
436
437 // **************************** START SAMPLE *******************
438 conv = ucnv_open("utf-8", &status);
439 assert(U_SUCCESS(status));
440
441 uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
442 printf("input bytes %d / min chars %d = %d UChars\n",
443 BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
444
445 // grab another buffer's worth
446 while((!feof(f)) &&
447 ((count=static_cast<int32_t>(fread(inBuf, 1, BUFFERSIZE , f))) > 0) )
448 {
449 // Convert bytes to unicode
450 source = inBuf;
451 sourceLimit = inBuf + count;
452
453 while(source < sourceLimit)
454 {
455 p = ucnv_getNextUChar(conv, &source, sourceLimit, &status);
456 if(U_FAILURE(status))
457 {
458 fprintf(stderr, "%s @ %d\n", u_errorName(status), total);
459 status = U_ZERO_ERROR;
460 continue;
461 }
462 U_ASSERT(status);
463 total++;
464
465 if(u_isalpha(p))
466 letters++;
467
468 if((u_tolower(l) == 'i') && (u_tolower(p) == 'e'))
469 ie++;
470
471 if((u_tolower(l) == 'g') && (u_tolower(p) == 0x0127))
472 gh++;
473
474 if(p>charCount)
475 {
476 fprintf(stderr, "U+%06X: oh.., we only handle BMP characters so far.. redesign!\n", p);
477 free(info);
478 fclose(f);
479 ucnv_close(conv);
480 return U_UNSUPPORTED_ERROR;
481 }
482 info[p].frequency++;
483 l = p;
484 }
485 }
486
487 fclose(f);
488 ucnv_close(conv);
489
490 printf("%d letters out of %d total UChars.\n", letters, total);
491 printf("%d ie digraphs, %d gh digraphs.\n", ie, gh);
492
493 // now, we could sort it..
494
495 // qsort(info, charCount, sizeof(info[0]), charfreq_compare);
496
497 for(p=0;p<charCount;p++)
498 {
499 if(info[p].frequency)
500 {
501 printf("% 5d U+%06X ", info[p].frequency, p);
502 if(p <= 0xFFFF)
503 {
504 prettyPrintUChar((UChar)p);
505 }
506 printf("\n");
507 }
508 }
509 free(info);
510 // ***************************** END SAMPLE ********************
511
512 printf("\n");
513
514 return U_ZERO_ERROR;
515 }
516 #undef BUFFERSIZE
517
518
519 /******************************************************
520 You must call ucnv_close to clean up the memory used by the
521 converter.
522
523 'len' returns the number of OUTPUT bytes resulting from the
524 conversion.
525 */
526
convsample_12()527 UErrorCode convsample_12()
528 {
529 printf("\n\n==============================================\n"
530 "Sample 12: C: simple sjis -> unicode conversion\n");
531
532
533 // **************************** START SAMPLE *******************
534
535 char source[] = { 0x63, 0x61, 0x74, (char)0x94, 0x4C, (char)0x82, 0x6E, (char)0x82, 0x6A, 0x00 };
536 UChar target[100];
537 UErrorCode status = U_ZERO_ERROR;
538 UConverter *conv;
539 int32_t len;
540
541 // set up the converter
542 conv = ucnv_open("shift_jis", &status);
543 assert(U_SUCCESS(status));
544
545 // convert to Unicode
546 // Note: we can use strlen, we know it's an 8 bit null terminated codepage
547 target[6] = 0xFDCA;
548 len = ucnv_toUChars(conv, target, 100, source, static_cast<int32_t>(strlen(source)), &status);
549 U_ASSERT(status);
550 // close the converter
551 ucnv_close(conv);
552
553 // ***************************** END SAMPLE ********************
554
555 // Print it out
556 printBytes("src", source, static_cast<int32_t>(strlen(source)) );
557 printf("\n");
558 printUChars("targ", target, len);
559
560 return U_ZERO_ERROR;
561 }
562
563 /******************************************************************
564 C: Convert from codepage to Unicode one at a time.
565 */
566
convsample_13()567 UErrorCode convsample_13()
568 {
569 printf("\n\n==============================================\n"
570 "Sample 13: C: simple Big5 -> unicode conversion, char at a time\n");
571
572
573 const char sourceChars[] = { 0x7a, 0x68, 0x3d, (char)0xa4, (char)0xa4, (char)0xa4, (char)0xe5, (char)0x2e };
574 // const char sourceChars[] = { 0x7a, 0x68, 0x3d, 0xe4, 0xb8, 0xad, 0xe6, 0x96, 0x87, 0x2e };
575 const char *source, *sourceLimit;
576 UChar32 target;
577 UErrorCode status = U_ZERO_ERROR;
578 UConverter *conv = NULL;
579 int32_t srcCount=0;
580 int32_t dstCount=0;
581
582 srcCount = sizeof(sourceChars);
583
584 conv = ucnv_open("Big5", &status);
585 U_ASSERT(status);
586
587 source = sourceChars;
588 sourceLimit = sourceChars + sizeof(sourceChars);
589
590 // **************************** START SAMPLE *******************
591
592
593 printBytes("src", source, static_cast<int32_t>(sourceLimit - source));
594
595 while(source < sourceLimit)
596 {
597 puts("");
598 target = ucnv_getNextUChar (conv,
599 &source,
600 sourceLimit,
601 &status);
602
603 // printBytes("src",source,sourceLimit-source);
604 U_ASSERT(status);
605 printUChar(target);
606 dstCount++;
607 }
608
609
610 // ************************** END SAMPLE *************************
611
612 printf("src=%d bytes, dst=%d uchars\n", srcCount, dstCount);
613 ucnv_close(conv);
614
615 return U_ZERO_ERROR;
616 }
617
618
619
620
convsample_20_didSubstitute(const char * source)621 UBool convsample_20_didSubstitute(const char *source)
622 {
623 UChar uchars[100];
624 char bytes[100];
625 UConverter *conv = NULL;
626 UErrorCode status = U_ZERO_ERROR;
627 uint32_t len, len2;
628 UBool flagVal;
629
630 FromUFLAGContext * context = NULL;
631
632 printf("\n\n==============================================\n"
633 "Sample 20: C: Test for substitution using callbacks\n");
634
635 /* print out the original source */
636 printBytes("src", source);
637 printf("\n");
638
639 /* First, convert from UTF8 to unicode */
640 conv = ucnv_open("utf-8", &status);
641 U_ASSERT(status);
642
643 len = ucnv_toUChars(conv, uchars, 100, source, static_cast<int32_t>(strlen(source)), &status);
644 U_ASSERT(status);
645
646 printUChars("uch", uchars, len);
647 printf("\n");
648
649 /* Now, close the converter */
650 ucnv_close(conv);
651
652 /* Now, convert to windows-1252 */
653 conv = ucnv_open("windows-1252", &status);
654 U_ASSERT(status);
655
656 /* Converter starts out with the SUBSTITUTE callback set. */
657
658 /* initialize our callback */
659 context = flagCB_fromU_openContext();
660
661 /* Set our special callback */
662 ucnv_setFromUCallBack(conv,
663 flagCB_fromU,
664 context,
665 &(context->subCallback),
666 &(context->subContext),
667 &status);
668
669 U_ASSERT(status);
670
671 len2 = ucnv_fromUChars(conv, bytes, 100, uchars, len, &status);
672 U_ASSERT(status);
673
674 flagVal = context->flag; /* it's about to go away when we close the cnv */
675
676 ucnv_close(conv);
677
678 /* print out the original source */
679 printBytes("bytes", bytes, len2);
680
681 return flagVal; /* true if callback was called */
682 }
683
convsample_20()684 UErrorCode convsample_20()
685 {
686 const char *sample1 = "abc\xdf\xbf";
687 const char *sample2 = "abc_def";
688
689
690 if(convsample_20_didSubstitute(sample1))
691 {
692 printf("DID substitute.\n******\n");
693 }
694 else
695 {
696 printf("Did NOT substitute.\n*****\n");
697 }
698
699 if(convsample_20_didSubstitute(sample2))
700 {
701 printf("DID substitute.\n******\n");
702 }
703 else
704 {
705 printf("Did NOT substitute.\n*****\n");
706 }
707
708 return U_ZERO_ERROR;
709 }
710
711 // 21 - C, callback, with clone and debug
712
713
714
/*
 * Sample 21 worker: like sample 20, but with a chain of three callbacks
 * (debug -> flag -> debug) installed on the windows-1252 converter, and
 * with the actual conversion performed on a clone of that converter after
 * the original has been closed.
 *
 * source  NUL-terminated UTF-8 input
 * returns the 'flag' field of the flag context reached through the
 *         clone's callback chain (true if callback was called), or false
 *         if that context could not be obtained
 */
UBool convsample_21_didSubstitute(const char *source)
{
    UChar uchars[100];
    char bytes[100];
    UConverter *conv = NULL, *cloneCnv = NULL;
    UErrorCode status = U_ZERO_ERROR;
    uint32_t len, len2;
    UBool flagVal = false;
    UConverterFromUCallback junkCB;   /* receives the clone's callback pointer; not used afterwards */

    FromUFLAGContext *flagCtx = NULL,
                     *cloneFlagCtx = NULL;

    debugCBContext   *debugCtx1 = NULL,
                     *debugCtx2 = NULL,
                     *cloneDebugCtx = NULL;

    printf("\n\n==============================================\n"
           "Sample 21: C: Test for substitution w/ callbacks & clones \n");

    /* print out the original source */
    printBytes("src", source);
    printf("\n");

    /* First, convert from UTF8 to unicode */
    conv = ucnv_open("utf-8", &status);
    U_ASSERT(status);

    len = ucnv_toUChars(conv, uchars, 100, source, static_cast<int32_t>(strlen(source)), &status);
    U_ASSERT(status);

    printUChars("uch", uchars, len);
    printf("\n");

    /* Now, close the converter */
    ucnv_close(conv);

    /* Now, convert to windows-1252 */
    conv = ucnv_open("windows-1252", &status);
    U_ASSERT(status);

    /* Converter starts out with the SUBSTITUTE callback set. */

    /* initialize our callback */
    /* from the 'bottom' innermost, out
     *   CNV ->  debugCtx1[debug]  ->  flagCtx[flag] -> debugCtx2[debug]  */

#if DEBUG_TMI
    printf("flagCB_fromU = %p\n", &flagCB_fromU);
    printf("debugCB_fromU = %p\n", &debugCB_fromU);
#endif

    /* Create the three contexts that make up the chain. */
    debugCtx1 = debugCB_openContext();
    flagCtx   = flagCB_fromU_openContext();
    debugCtx2 = debugCB_openContext();

    debugCtx1->subCallback =  flagCB_fromU;  /* debug1 -> flag */
    debugCtx1->subContext  =  flagCtx;

    flagCtx->subCallback   =  debugCB_fromU; /*  flag -> debug2 */
    flagCtx->subContext    =  debugCtx2;

    /* End of the chain: debug2 hands over to the substitute callback. */
    debugCtx2->subCallback =  UCNV_FROM_U_CALLBACK_SUBSTITUTE;
    debugCtx2->subContext  = NULL;

    /* Set our special callback */

    /* Install the head of the chain (debug1). The callback and context
     * that were set on the converter before this call are written into
     * debugCtx2's subCallback/subContext fields. */
    ucnv_setFromUCallBack(conv,
                          debugCB_fromU,
                          debugCtx1,
                          &(debugCtx2->subCallback),
                          &(debugCtx2->subContext),
                          &status);

    U_ASSERT(status);

#if DEBUG_TMI
    printf("Callback chain now: Converter %p -> debug1:%p-> (%p:%p)==flag:%p -> debug2:%p -> cb %p\n",
           conv, debugCtx1, debugCtx1->subCallback,
           debugCtx1->subContext, flagCtx, debugCtx2, debugCtx2->subCallback);
#endif

    /* Clone the converter, then close the original; only the clone is used from here on. */
    cloneCnv = ucnv_safeClone(conv, NULL, NULL, &status);

    U_ASSERT(status);

#if DEBUG_TMI
    printf("Cloned converter from %p -> %p.  Closing %p.\n", conv, cloneCnv, conv);
#endif

    ucnv_close(conv);

#if DEBUG_TMI
    printf("%p closed.\n", conv);
#endif

    U_ASSERT(status);
    /* Now, we have to extract the context */
    /* NOTE(review): presumably the clone carries its own copies of the
     * contexts, which is why they are fetched from the clone instead of
     * reusing debugCtx1/flagCtx - confirm against flagcb.h. */
    cloneDebugCtx = NULL;
    cloneFlagCtx  = NULL;

    /* The clone's callback context is the head of its chain (debug1);
     * the flag context is that context's subContext. */
    ucnv_getFromUCallBack(cloneCnv, &junkCB, (const void **)&cloneDebugCtx);
    if(cloneDebugCtx != NULL) {
        cloneFlagCtx = (FromUFLAGContext*) cloneDebugCtx -> subContext;
    }

    printf("Cloned converter chain: %p -> %p[debug1] -> %p[flag] -> %p[debug2] -> substitute\n",
           cloneCnv, cloneDebugCtx, cloneFlagCtx, cloneFlagCtx?cloneFlagCtx->subContext:NULL );

    /* Do the actual conversion with the clone. */
    len2 = ucnv_fromUChars(cloneCnv, bytes, 100, uchars, len, &status);
    U_ASSERT(status);

    if(cloneFlagCtx != NULL) {
        flagVal = cloneFlagCtx->flag;  /* it's about to go away when we close the cnv */
    } else {
        printf("** Warning, couldn't get the subcallback \n");
    }

    ucnv_close(cloneCnv);

    /* print out the converted bytes */
    printBytes("bytes", bytes, len2);

    return flagVal; /* true if callback was called */
}
840
convsample_21()841 UErrorCode convsample_21()
842 {
843 const char *sample1 = "abc\xdf\xbf";
844 const char *sample2 = "abc_def";
845
846 if(convsample_21_didSubstitute(sample1))
847 {
848 printf("DID substitute.\n******\n");
849 }
850 else
851 {
852 printf("Did NOT substitute.\n*****\n");
853 }
854
855 if(convsample_21_didSubstitute(sample2))
856 {
857 printf("DID substitute.\n******\n");
858 }
859 else
860 {
861 printf("Did NOT substitute.\n*****\n");
862 }
863
864 return U_ZERO_ERROR;
865 }
866
867
868 // 40- C, cp37 -> UTF16 [data02.bin -> data40.utf16]
869
870 #define BUFFERSIZE 17 /* make it interesting :) */
871
convsample_40()872 UErrorCode convsample_40()
873 {
874 printf("\n\n==============================================\n"
875 "Sample 40: C: convert data02.bin from cp37 to UTF16 [data40.utf16]\n");
876
877 FILE *f;
878 FILE *out;
879 int32_t count;
880 char inBuf[BUFFERSIZE];
881 const char *source;
882 const char *sourceLimit;
883 UChar *uBuf;
884 UChar *target;
885 UChar *targetLimit;
886 int32_t uBufSize = 0;
887 UConverter *conv = NULL;
888 UErrorCode status = U_ZERO_ERROR;
889 uint32_t inbytes=0, total=0;
890
891 f = fopen("data02.bin", "rb");
892 if(!f)
893 {
894 fprintf(stderr, "Couldn't open file 'data02.bin' (cp37 data file).\n");
895 return U_FILE_ACCESS_ERROR;
896 }
897
898 out = fopen("data40.utf16", "wb");
899 if(!out)
900 {
901 fprintf(stderr, "Couldn't create file 'data40.utf16'.\n");
902 fclose(f);
903 return U_FILE_ACCESS_ERROR;
904 }
905
906 // **************************** START SAMPLE *******************
907 conv = ucnv_openCCSID(37, UCNV_IBM, &status);
908 assert(U_SUCCESS(status));
909
910 uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
911 printf("input bytes %d / min chars %d = %d UChars\n",
912 BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
913 uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
914 assert(uBuf!=NULL);
915
916 // grab another buffer's worth
917 while((!feof(f)) &&
918 ((count=static_cast<int32_t>(fread(inBuf, 1, BUFFERSIZE , f))) > 0) )
919 {
920 inbytes += count;
921
922 // Convert bytes to unicode
923 source = inBuf;
924 sourceLimit = inBuf + count;
925
926 do
927 {
928 target = uBuf;
929 targetLimit = uBuf + uBufSize;
930
931 ucnv_toUnicode( conv, &target, targetLimit,
932 &source, sourceLimit, NULL,
933 feof(f)?true:false, /* pass 'flush' when eof */
934 /* is true (when no more data will come) */
935 &status);
936
937 if(status == U_BUFFER_OVERFLOW_ERROR)
938 {
939 // simply ran out of space - we'll reset the target ptr the next
940 // time through the loop.
941 status = U_ZERO_ERROR;
942 }
943 else
944 {
945 // Check other errors here.
946 assert(U_SUCCESS(status));
947 // Break out of the loop (by force)
948 }
949
950 // Process the Unicode
951 // Todo: handle UTF-16/surrogates
952 assert(fwrite(uBuf, sizeof(uBuf[0]), (target-uBuf), out) == (size_t)(target-uBuf));
953 total += static_cast<uint32_t>((target-uBuf));
954 } while (source < sourceLimit); // while simply out of space
955 }
956
957 printf("%d bytes in, %d UChars out.\n", inbytes, total);
958
959 // ***************************** END SAMPLE ********************
960 ucnv_close(conv);
961
962 fclose(f);
963 fclose(out);
964 printf("\n");
965
966 return U_ZERO_ERROR;
967 }
968 #undef BUFFERSIZE
969
970
971
972 // 46- C, UTF16 -> latin2 [data40.utf16 -> data46.out]
973
974 #define BUFFERSIZE 24 /* make it interesting :) */
975
convsample_46()976 UErrorCode convsample_46()
977 {
978 printf("\n\n==============================================\n"
979 "Sample 46: C: convert data40.utf16 from UTF16 to latin2 [data46.out]\n");
980
981 FILE *f;
982 FILE *out;
983 int32_t count;
984 UChar inBuf[BUFFERSIZE];
985 const UChar *source;
986 const UChar *sourceLimit;
987 char *buf;
988 char *target;
989 char *targetLimit;
990
991 int32_t bufSize = 0;
992 UConverter *conv = NULL;
993 UErrorCode status = U_ZERO_ERROR;
994 uint32_t inchars=0, total=0;
995
996 f = fopen("data40.utf16", "rb");
997 if(!f)
998 {
999 fprintf(stderr, "Couldn't open file 'data40.utf16' (did you run convsample_40() ?)\n");
1000 return U_FILE_ACCESS_ERROR;
1001 }
1002
1003 out = fopen("data46.out", "wb");
1004 if(!out)
1005 {
1006 fprintf(stderr, "Couldn't create file 'data46.out'.\n");
1007 fclose(f);
1008 return U_FILE_ACCESS_ERROR;
1009 }
1010
1011 // **************************** START SAMPLE *******************
1012 conv = ucnv_open( "iso-8859-2", &status);
1013 assert(U_SUCCESS(status));
1014
1015 bufSize = (BUFFERSIZE*ucnv_getMaxCharSize(conv));
1016 printf("input UChars[16] %d * max charsize %d = %d bytes output buffer\n",
1017 BUFFERSIZE, ucnv_getMaxCharSize(conv), bufSize);
1018 buf = (char*)malloc(bufSize * sizeof(char));
1019 assert(buf!=NULL);
1020
1021 // grab another buffer's worth
1022 while((!feof(f)) &&
1023 ((count=static_cast<int32_t>(fread(inBuf, sizeof(UChar), BUFFERSIZE , f))) > 0) )
1024 {
1025 inchars += count;
1026
1027 // Convert bytes to unicode
1028 source = inBuf;
1029 sourceLimit = inBuf + count;
1030
1031 do
1032 {
1033 target = buf;
1034 targetLimit = buf + bufSize;
1035
1036 ucnv_fromUnicode( conv, &target, targetLimit,
1037 &source, sourceLimit, NULL,
1038 feof(f)?true:false, /* pass 'flush' when eof */
1039 /* is true (when no more data will come) */
1040 &status);
1041
1042 if(status == U_BUFFER_OVERFLOW_ERROR)
1043 {
1044 // simply ran out of space - we'll reset the target ptr the next
1045 // time through the loop.
1046 status = U_ZERO_ERROR;
1047 }
1048 else
1049 {
1050 // Check other errors here.
1051 assert(U_SUCCESS(status));
1052 // Break out of the loop (by force)
1053 }
1054
1055 // Process the Unicode
1056 assert(fwrite(buf, sizeof(buf[0]), (target-buf), out) == (size_t)(target-buf));
1057 total += static_cast<uint32_t>((target-buf));
1058 } while (source < sourceLimit); // while simply out of space
1059 }
1060
1061 printf("%d Uchars (%d bytes) in, %d chars out.\n", inchars, static_cast<int>(inchars * sizeof(UChar)), total);
1062
1063 // ***************************** END SAMPLE ********************
1064 ucnv_close(conv);
1065
1066 fclose(f);
1067 fclose(out);
1068 printf("\n");
1069
1070 return U_ZERO_ERROR;
1071 }
1072 #undef BUFFERSIZE
1073
1074 #define BUFFERSIZE 219
1075
convsample_50()1076 void convsample_50() {
1077 printf("\n\n==============================================\n"
1078 "Sample 50: C: ucnv_detectUnicodeSignature\n");
1079
1080 //! [ucnv_detectUnicodeSignature]
1081 UErrorCode err = U_ZERO_ERROR;
1082 UBool discardSignature = true; /* set to true to throw away the initial U+FEFF */
1083 char input[] = { '\xEF','\xBB', '\xBF','\x41','\x42','\x43' };
1084 int32_t signatureLength = 0;
1085 const char *encoding = ucnv_detectUnicodeSignature(input,sizeof(input),&signatureLength,&err);
1086 UConverter *conv = NULL;
1087 UChar output[100];
1088 UChar *target = output, *out;
1089 const char *source = input;
1090 if(encoding!=NULL && U_SUCCESS(err)){
1091 // should signature be discarded ?
1092 conv = ucnv_open(encoding, &err);
1093 // do the conversion
1094 ucnv_toUnicode(conv,
1095 &target, output + UPRV_LENGTHOF(output),
1096 &source, input + sizeof(input),
1097 NULL, true, &err);
1098 out = output;
1099 if (discardSignature){
1100 ++out; // ignore initial U+FEFF
1101 }
1102 while(out != target) {
1103 printf("%04x ", *out++);
1104 }
1105 puts("");
1106 }
1107 //! [ucnv_detectUnicodeSignature]
1108 puts("");
1109 }
1110
1111
1112
1113 /* main */
1114
main()1115 int main()
1116 {
1117
1118 printf("Default Converter=%s\n", ucnv_getDefaultName() );
1119
1120 convsample_02(); // C , u->koi8r, conv
1121 convsample_03(); // C, iterate
1122
1123 convsample_05(); // C, utf8->u, getNextUChar
1124 convsample_06(); // C freq counter thingy
1125
1126 convsample_12(); // C, sjis->u, conv
1127 convsample_13(); // C, big5->u, getNextU
1128
1129 convsample_20(); // C, callback
1130 convsample_21(); // C, callback debug
1131
1132 convsample_40(); // C, cp37 -> UTF16 [data02.bin -> data40.utf16]
1133
1134 convsample_46(); // C, UTF16 -> latin3 [data41.utf16 -> data46.out]
1135
1136 convsample_50(); // C, detect unicode signature
1137
1138 printf("End of converter samples.\n");
1139
1140 fflush(stdout);
1141 fflush(stderr);
1142
1143 return 0;
1144 }
1145