1 /*
2 **********************************************************************
3 * Copyright (C) 2002-2009, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: ucnv_u32.c
7 * encoding: US-ASCII
8 * tab size: 8 (not used)
9 * indentation:4
10 *
11 * created on: 2002jul01
12 * created by: Markus W. Scherer
13 *
14 * UTF-32 converter implementation. Used to be in ucnv_utf.c.
15 */
16
17 #include "unicode/utypes.h"
18
19 #if !UCONFIG_NO_CONVERSION
20
21 #include "unicode/ucnv.h"
22 #include "ucnv_bld.h"
23 #include "ucnv_cnv.h"
24 #include "cmemory.h"
25
/* Code point limits and UTF-16 surrogate arithmetic used by the converters below. */
#define MAXIMUM_UCS2            0xFFFF      /* largest code point stored in a single UChar */
#define MAXIMUM_UTF             0x10FFFF    /* largest code point accepted from UTF-32 input */
#define HALF_SHIFT              10          /* a surrogate carries 10 payload bits */
#define HALF_BASE               0x10000     /* first supplementary code point */
#define HALF_MASK               0x3FF
#define SURROGATE_HIGH_START    0xD800
#define SURROGATE_LOW_START     0xDC00

/* HALF_BASE - SURROGATE_LOW_START (== 9216): added when combining a surrogate pair */
#define SURROGATE_LOW_BASE      (HALF_BASE - SURROGATE_LOW_START)

/* Value of fromUnicodeStatus meaning "a BOM still has to be written". */
enum {
    UCNV_NEED_TO_WRITE_BOM = 1
};
40
41 /* UTF-32BE ----------------------------------------------------------------- */
42
43 static void
T_UConverter_toUnicode_UTF32_BE(UConverterToUnicodeArgs * args,UErrorCode * err)44 T_UConverter_toUnicode_UTF32_BE(UConverterToUnicodeArgs * args,
45 UErrorCode * err)
46 {
47 const unsigned char *mySource = (unsigned char *) args->source;
48 UChar *myTarget = args->target;
49 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
50 const UChar *targetLimit = args->targetLimit;
51 unsigned char *toUBytes = args->converter->toUBytes;
52 uint32_t ch, i;
53
54 /* Restore state of current sequence */
55 if (args->converter->toUnicodeStatus && myTarget < targetLimit) {
56 i = args->converter->toULength; /* restore # of bytes consumed */
57 args->converter->toULength = 0;
58
59 ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
60 args->converter->toUnicodeStatus = 0;
61 goto morebytes;
62 }
63
64 while (mySource < sourceLimit && myTarget < targetLimit) {
65 i = 0;
66 ch = 0;
67 morebytes:
68 while (i < sizeof(uint32_t)) {
69 if (mySource < sourceLimit) {
70 ch = (ch << 8) | (uint8_t)(*mySource);
71 toUBytes[i++] = (char) *(mySource++);
72 }
73 else {
74 /* stores a partially calculated target*/
75 /* + 1 to make 0 a valid character */
76 args->converter->toUnicodeStatus = ch + 1;
77 args->converter->toULength = (int8_t) i;
78 goto donefornow;
79 }
80 }
81
82 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
83 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
84 if (ch <= MAXIMUM_UCS2)
85 {
86 /* fits in 16 bits */
87 *(myTarget++) = (UChar) ch;
88 }
89 else {
90 /* write out the surrogates */
91 *(myTarget++) = U16_LEAD(ch);
92 ch = U16_TRAIL(ch);
93 if (myTarget < targetLimit) {
94 *(myTarget++) = (UChar)ch;
95 }
96 else {
97 /* Put in overflow buffer (not handled here) */
98 args->converter->UCharErrorBuffer[0] = (UChar) ch;
99 args->converter->UCharErrorBufferLength = 1;
100 *err = U_BUFFER_OVERFLOW_ERROR;
101 break;
102 }
103 }
104 }
105 else {
106 args->converter->toULength = (int8_t)i;
107 *err = U_ILLEGAL_CHAR_FOUND;
108 break;
109 }
110 }
111
112 donefornow:
113 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
114 /* End of target buffer */
115 *err = U_BUFFER_OVERFLOW_ERROR;
116 }
117
118 args->target = myTarget;
119 args->source = (const char *) mySource;
120 }
121
122 static void
T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,UErrorCode * err)123 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
124 UErrorCode * err)
125 {
126 const unsigned char *mySource = (unsigned char *) args->source;
127 UChar *myTarget = args->target;
128 int32_t *myOffsets = args->offsets;
129 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
130 const UChar *targetLimit = args->targetLimit;
131 unsigned char *toUBytes = args->converter->toUBytes;
132 uint32_t ch, i;
133 int32_t offsetNum = 0;
134
135 /* Restore state of current sequence */
136 if (args->converter->toUnicodeStatus && myTarget < targetLimit) {
137 i = args->converter->toULength; /* restore # of bytes consumed */
138 args->converter->toULength = 0;
139
140 ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
141 args->converter->toUnicodeStatus = 0;
142 goto morebytes;
143 }
144
145 while (mySource < sourceLimit && myTarget < targetLimit) {
146 i = 0;
147 ch = 0;
148 morebytes:
149 while (i < sizeof(uint32_t)) {
150 if (mySource < sourceLimit) {
151 ch = (ch << 8) | (uint8_t)(*mySource);
152 toUBytes[i++] = (char) *(mySource++);
153 }
154 else {
155 /* stores a partially calculated target*/
156 /* + 1 to make 0 a valid character */
157 args->converter->toUnicodeStatus = ch + 1;
158 args->converter->toULength = (int8_t) i;
159 goto donefornow;
160 }
161 }
162
163 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
164 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
165 if (ch <= MAXIMUM_UCS2) {
166 /* fits in 16 bits */
167 *(myTarget++) = (UChar) ch;
168 *(myOffsets++) = offsetNum;
169 }
170 else {
171 /* write out the surrogates */
172 *(myTarget++) = U16_LEAD(ch);
173 *myOffsets++ = offsetNum;
174 ch = U16_TRAIL(ch);
175 if (myTarget < targetLimit)
176 {
177 *(myTarget++) = (UChar)ch;
178 *(myOffsets++) = offsetNum;
179 }
180 else {
181 /* Put in overflow buffer (not handled here) */
182 args->converter->UCharErrorBuffer[0] = (UChar) ch;
183 args->converter->UCharErrorBufferLength = 1;
184 *err = U_BUFFER_OVERFLOW_ERROR;
185 break;
186 }
187 }
188 }
189 else {
190 args->converter->toULength = (int8_t)i;
191 *err = U_ILLEGAL_CHAR_FOUND;
192 break;
193 }
194 offsetNum += i;
195 }
196
197 donefornow:
198 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
199 {
200 /* End of target buffer */
201 *err = U_BUFFER_OVERFLOW_ERROR;
202 }
203
204 args->target = myTarget;
205 args->source = (const char *) mySource;
206 args->offsets = myOffsets;
207 }
208
209 static void
T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs * args,UErrorCode * err)210 T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs * args,
211 UErrorCode * err)
212 {
213 const UChar *mySource = args->source;
214 unsigned char *myTarget;
215 const UChar *sourceLimit = args->sourceLimit;
216 const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
217 UChar32 ch, ch2;
218 unsigned int indexToWrite;
219 unsigned char temp[sizeof(uint32_t)];
220
221 if(mySource >= sourceLimit) {
222 /* no input, nothing to do */
223 return;
224 }
225
226 /* write the BOM if necessary */
227 if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
228 static const char bom[]={ 0, 0, (char)0xfe, (char)0xff };
229 ucnv_fromUWriteBytes(args->converter,
230 bom, 4,
231 &args->target, args->targetLimit,
232 &args->offsets, -1,
233 err);
234 args->converter->fromUnicodeStatus=0;
235 }
236
237 myTarget = (unsigned char *) args->target;
238 temp[0] = 0;
239
240 if (args->converter->fromUChar32) {
241 ch = args->converter->fromUChar32;
242 args->converter->fromUChar32 = 0;
243 goto lowsurogate;
244 }
245
246 while (mySource < sourceLimit && myTarget < targetLimit) {
247 ch = *(mySource++);
248
249 if (UTF_IS_SURROGATE(ch)) {
250 if (U_IS_LEAD(ch)) {
251 lowsurogate:
252 if (mySource < sourceLimit) {
253 ch2 = *mySource;
254 if (U_IS_TRAIL(ch2)) {
255 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
256 mySource++;
257 }
258 else {
259 /* this is an unmatched trail code unit (2nd surrogate) */
260 /* callback(illegal) */
261 args->converter->fromUChar32 = ch;
262 *err = U_ILLEGAL_CHAR_FOUND;
263 break;
264 }
265 }
266 else {
267 /* ran out of source */
268 args->converter->fromUChar32 = ch;
269 if (args->flush) {
270 /* this is an unmatched trail code unit (2nd surrogate) */
271 /* callback(illegal) */
272 *err = U_ILLEGAL_CHAR_FOUND;
273 }
274 break;
275 }
276 }
277 else {
278 /* this is an unmatched trail code unit (2nd surrogate) */
279 /* callback(illegal) */
280 args->converter->fromUChar32 = ch;
281 *err = U_ILLEGAL_CHAR_FOUND;
282 break;
283 }
284 }
285
286 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
287 temp[1] = (uint8_t) (ch >> 16 & 0x1F);
288 temp[2] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */
289 temp[3] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */
290
291 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {
292 if (myTarget < targetLimit) {
293 *(myTarget++) = temp[indexToWrite];
294 }
295 else {
296 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
297 *err = U_BUFFER_OVERFLOW_ERROR;
298 }
299 }
300 }
301
302 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
303 *err = U_BUFFER_OVERFLOW_ERROR;
304 }
305
306 args->target = (char *) myTarget;
307 args->source = mySource;
308 }
309
310 static void
T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,UErrorCode * err)311 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
312 UErrorCode * err)
313 {
314 const UChar *mySource = args->source;
315 unsigned char *myTarget;
316 int32_t *myOffsets;
317 const UChar *sourceLimit = args->sourceLimit;
318 const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
319 UChar32 ch, ch2;
320 int32_t offsetNum = 0;
321 unsigned int indexToWrite;
322 unsigned char temp[sizeof(uint32_t)];
323
324 if(mySource >= sourceLimit) {
325 /* no input, nothing to do */
326 return;
327 }
328
329 /* write the BOM if necessary */
330 if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
331 static const char bom[]={ 0, 0, (char)0xfe, (char)0xff };
332 ucnv_fromUWriteBytes(args->converter,
333 bom, 4,
334 &args->target, args->targetLimit,
335 &args->offsets, -1,
336 err);
337 args->converter->fromUnicodeStatus=0;
338 }
339
340 myTarget = (unsigned char *) args->target;
341 myOffsets = args->offsets;
342 temp[0] = 0;
343
344 if (args->converter->fromUChar32) {
345 ch = args->converter->fromUChar32;
346 args->converter->fromUChar32 = 0;
347 goto lowsurogate;
348 }
349
350 while (mySource < sourceLimit && myTarget < targetLimit) {
351 ch = *(mySource++);
352
353 if (UTF_IS_SURROGATE(ch)) {
354 if (U_IS_LEAD(ch)) {
355 lowsurogate:
356 if (mySource < sourceLimit) {
357 ch2 = *mySource;
358 if (U_IS_TRAIL(ch2)) {
359 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
360 mySource++;
361 }
362 else {
363 /* this is an unmatched trail code unit (2nd surrogate) */
364 /* callback(illegal) */
365 args->converter->fromUChar32 = ch;
366 *err = U_ILLEGAL_CHAR_FOUND;
367 break;
368 }
369 }
370 else {
371 /* ran out of source */
372 args->converter->fromUChar32 = ch;
373 if (args->flush) {
374 /* this is an unmatched trail code unit (2nd surrogate) */
375 /* callback(illegal) */
376 *err = U_ILLEGAL_CHAR_FOUND;
377 }
378 break;
379 }
380 }
381 else {
382 /* this is an unmatched trail code unit (2nd surrogate) */
383 /* callback(illegal) */
384 args->converter->fromUChar32 = ch;
385 *err = U_ILLEGAL_CHAR_FOUND;
386 break;
387 }
388 }
389
390 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
391 temp[1] = (uint8_t) (ch >> 16 & 0x1F);
392 temp[2] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */
393 temp[3] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */
394
395 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {
396 if (myTarget < targetLimit) {
397 *(myTarget++) = temp[indexToWrite];
398 *(myOffsets++) = offsetNum;
399 }
400 else {
401 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
402 *err = U_BUFFER_OVERFLOW_ERROR;
403 }
404 }
405 offsetNum = offsetNum + 1 + (temp[1] != 0);
406 }
407
408 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
409 *err = U_BUFFER_OVERFLOW_ERROR;
410 }
411
412 args->target = (char *) myTarget;
413 args->source = mySource;
414 args->offsets = myOffsets;
415 }
416
417 static UChar32
T_UConverter_getNextUChar_UTF32_BE(UConverterToUnicodeArgs * args,UErrorCode * err)418 T_UConverter_getNextUChar_UTF32_BE(UConverterToUnicodeArgs* args,
419 UErrorCode* err)
420 {
421 const uint8_t *mySource;
422 UChar32 myUChar;
423 int32_t length;
424
425 mySource = (const uint8_t *)args->source;
426 if (mySource >= (const uint8_t *)args->sourceLimit)
427 {
428 /* no input */
429 *err = U_INDEX_OUTOFBOUNDS_ERROR;
430 return 0xffff;
431 }
432
433 length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
434 if (length < 4)
435 {
436 /* got a partial character */
437 uprv_memcpy(args->converter->toUBytes, mySource, length);
438 args->converter->toULength = (int8_t)length;
439 args->source = (const char *)(mySource + length);
440 *err = U_TRUNCATED_CHAR_FOUND;
441 return 0xffff;
442 }
443
444 /* Don't even try to do a direct cast because the value may be on an odd address. */
445 myUChar = ((UChar32)mySource[0] << 24)
446 | ((UChar32)mySource[1] << 16)
447 | ((UChar32)mySource[2] << 8)
448 | ((UChar32)mySource[3]);
449
450 args->source = (const char *)(mySource + 4);
451 if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
452 return myUChar;
453 }
454
455 uprv_memcpy(args->converter->toUBytes, mySource, 4);
456 args->converter->toULength = 4;
457
458 *err = U_ILLEGAL_CHAR_FOUND;
459 return 0xffff;
460 }
461
462 static const UConverterImpl _UTF32BEImpl = {
463 UCNV_UTF32_BigEndian,
464
465 NULL,
466 NULL,
467
468 NULL,
469 NULL,
470 NULL,
471
472 T_UConverter_toUnicode_UTF32_BE,
473 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC,
474 T_UConverter_fromUnicode_UTF32_BE,
475 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC,
476 T_UConverter_getNextUChar_UTF32_BE,
477
478 NULL,
479 NULL,
480 NULL,
481 NULL,
482 ucnv_getNonSurrogateUnicodeSet
483 };
484
485 /* The 1232 CCSID refers to any version of Unicode with any endianess of UTF-32 */
486 static const UConverterStaticData _UTF32BEStaticData = {
487 sizeof(UConverterStaticData),
488 "UTF-32BE",
489 1232,
490 UCNV_IBM, UCNV_UTF32_BigEndian, 4, 4,
491 { 0, 0, 0xff, 0xfd }, 4, FALSE, FALSE,
492 0,
493 0,
494 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
495 };
496
497 const UConverterSharedData _UTF32BEData = {
498 sizeof(UConverterSharedData), ~((uint32_t) 0),
499 NULL, NULL, &_UTF32BEStaticData, FALSE, &_UTF32BEImpl,
500 0
501 };
502
503 /* UTF-32LE ---------------------------------------------------------- */
504
505 static void
T_UConverter_toUnicode_UTF32_LE(UConverterToUnicodeArgs * args,UErrorCode * err)506 T_UConverter_toUnicode_UTF32_LE(UConverterToUnicodeArgs * args,
507 UErrorCode * err)
508 {
509 const unsigned char *mySource = (unsigned char *) args->source;
510 UChar *myTarget = args->target;
511 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
512 const UChar *targetLimit = args->targetLimit;
513 unsigned char *toUBytes = args->converter->toUBytes;
514 uint32_t ch, i;
515
516 /* Restore state of current sequence */
517 if (args->converter->toUnicodeStatus && myTarget < targetLimit)
518 {
519 i = args->converter->toULength; /* restore # of bytes consumed */
520 args->converter->toULength = 0;
521
522 /* Stores the previously calculated ch from a previous call*/
523 ch = args->converter->toUnicodeStatus - 1;
524 args->converter->toUnicodeStatus = 0;
525 goto morebytes;
526 }
527
528 while (mySource < sourceLimit && myTarget < targetLimit)
529 {
530 i = 0;
531 ch = 0;
532 morebytes:
533 while (i < sizeof(uint32_t))
534 {
535 if (mySource < sourceLimit)
536 {
537 ch |= ((uint8_t)(*mySource)) << (i * 8);
538 toUBytes[i++] = (char) *(mySource++);
539 }
540 else
541 {
542 /* stores a partially calculated target*/
543 /* + 1 to make 0 a valid character */
544 args->converter->toUnicodeStatus = ch + 1;
545 args->converter->toULength = (int8_t) i;
546 goto donefornow;
547 }
548 }
549
550 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
551 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
552 if (ch <= MAXIMUM_UCS2) {
553 /* fits in 16 bits */
554 *(myTarget++) = (UChar) ch;
555 }
556 else {
557 /* write out the surrogates */
558 *(myTarget++) = U16_LEAD(ch);
559 ch = U16_TRAIL(ch);
560 if (myTarget < targetLimit) {
561 *(myTarget++) = (UChar)ch;
562 }
563 else {
564 /* Put in overflow buffer (not handled here) */
565 args->converter->UCharErrorBuffer[0] = (UChar) ch;
566 args->converter->UCharErrorBufferLength = 1;
567 *err = U_BUFFER_OVERFLOW_ERROR;
568 break;
569 }
570 }
571 }
572 else {
573 args->converter->toULength = (int8_t)i;
574 *err = U_ILLEGAL_CHAR_FOUND;
575 break;
576 }
577 }
578
579 donefornow:
580 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
581 {
582 /* End of target buffer */
583 *err = U_BUFFER_OVERFLOW_ERROR;
584 }
585
586 args->target = myTarget;
587 args->source = (const char *) mySource;
588 }
589
590 static void
T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,UErrorCode * err)591 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
592 UErrorCode * err)
593 {
594 const unsigned char *mySource = (unsigned char *) args->source;
595 UChar *myTarget = args->target;
596 int32_t *myOffsets = args->offsets;
597 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
598 const UChar *targetLimit = args->targetLimit;
599 unsigned char *toUBytes = args->converter->toUBytes;
600 uint32_t ch, i;
601 int32_t offsetNum = 0;
602
603 /* Restore state of current sequence */
604 if (args->converter->toUnicodeStatus && myTarget < targetLimit)
605 {
606 i = args->converter->toULength; /* restore # of bytes consumed */
607 args->converter->toULength = 0;
608
609 /* Stores the previously calculated ch from a previous call*/
610 ch = args->converter->toUnicodeStatus - 1;
611 args->converter->toUnicodeStatus = 0;
612 goto morebytes;
613 }
614
615 while (mySource < sourceLimit && myTarget < targetLimit)
616 {
617 i = 0;
618 ch = 0;
619 morebytes:
620 while (i < sizeof(uint32_t))
621 {
622 if (mySource < sourceLimit)
623 {
624 ch |= ((uint8_t)(*mySource)) << (i * 8);
625 toUBytes[i++] = (char) *(mySource++);
626 }
627 else
628 {
629 /* stores a partially calculated target*/
630 /* + 1 to make 0 a valid character */
631 args->converter->toUnicodeStatus = ch + 1;
632 args->converter->toULength = (int8_t) i;
633 goto donefornow;
634 }
635 }
636
637 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch))
638 {
639 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
640 if (ch <= MAXIMUM_UCS2)
641 {
642 /* fits in 16 bits */
643 *(myTarget++) = (UChar) ch;
644 *(myOffsets++) = offsetNum;
645 }
646 else {
647 /* write out the surrogates */
648 *(myTarget++) = U16_LEAD(ch);
649 *(myOffsets++) = offsetNum;
650 ch = U16_TRAIL(ch);
651 if (myTarget < targetLimit)
652 {
653 *(myTarget++) = (UChar)ch;
654 *(myOffsets++) = offsetNum;
655 }
656 else
657 {
658 /* Put in overflow buffer (not handled here) */
659 args->converter->UCharErrorBuffer[0] = (UChar) ch;
660 args->converter->UCharErrorBufferLength = 1;
661 *err = U_BUFFER_OVERFLOW_ERROR;
662 break;
663 }
664 }
665 }
666 else
667 {
668 args->converter->toULength = (int8_t)i;
669 *err = U_ILLEGAL_CHAR_FOUND;
670 break;
671 }
672 offsetNum += i;
673 }
674
675 donefornow:
676 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
677 {
678 /* End of target buffer */
679 *err = U_BUFFER_OVERFLOW_ERROR;
680 }
681
682 args->target = myTarget;
683 args->source = (const char *) mySource;
684 args->offsets = myOffsets;
685 }
686
687 static void
T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs * args,UErrorCode * err)688 T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs * args,
689 UErrorCode * err)
690 {
691 const UChar *mySource = args->source;
692 unsigned char *myTarget;
693 const UChar *sourceLimit = args->sourceLimit;
694 const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
695 UChar32 ch, ch2;
696 unsigned int indexToWrite;
697 unsigned char temp[sizeof(uint32_t)];
698
699 if(mySource >= sourceLimit) {
700 /* no input, nothing to do */
701 return;
702 }
703
704 /* write the BOM if necessary */
705 if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
706 static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 };
707 ucnv_fromUWriteBytes(args->converter,
708 bom, 4,
709 &args->target, args->targetLimit,
710 &args->offsets, -1,
711 err);
712 args->converter->fromUnicodeStatus=0;
713 }
714
715 myTarget = (unsigned char *) args->target;
716 temp[3] = 0;
717
718 if (args->converter->fromUChar32)
719 {
720 ch = args->converter->fromUChar32;
721 args->converter->fromUChar32 = 0;
722 goto lowsurogate;
723 }
724
725 while (mySource < sourceLimit && myTarget < targetLimit)
726 {
727 ch = *(mySource++);
728
729 if (UTF_IS_SURROGATE(ch)) {
730 if (U_IS_LEAD(ch))
731 {
732 lowsurogate:
733 if (mySource < sourceLimit)
734 {
735 ch2 = *mySource;
736 if (U_IS_TRAIL(ch2)) {
737 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
738 mySource++;
739 }
740 else {
741 /* this is an unmatched trail code unit (2nd surrogate) */
742 /* callback(illegal) */
743 args->converter->fromUChar32 = ch;
744 *err = U_ILLEGAL_CHAR_FOUND;
745 break;
746 }
747 }
748 else {
749 /* ran out of source */
750 args->converter->fromUChar32 = ch;
751 if (args->flush) {
752 /* this is an unmatched trail code unit (2nd surrogate) */
753 /* callback(illegal) */
754 *err = U_ILLEGAL_CHAR_FOUND;
755 }
756 break;
757 }
758 }
759 else {
760 /* this is an unmatched trail code unit (2nd surrogate) */
761 /* callback(illegal) */
762 args->converter->fromUChar32 = ch;
763 *err = U_ILLEGAL_CHAR_FOUND;
764 break;
765 }
766 }
767
768 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
769 temp[2] = (uint8_t) (ch >> 16 & 0x1F);
770 temp[1] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */
771 temp[0] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */
772
773 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
774 {
775 if (myTarget < targetLimit)
776 {
777 *(myTarget++) = temp[indexToWrite];
778 }
779 else
780 {
781 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
782 *err = U_BUFFER_OVERFLOW_ERROR;
783 }
784 }
785 }
786
787 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
788 {
789 *err = U_BUFFER_OVERFLOW_ERROR;
790 }
791
792 args->target = (char *) myTarget;
793 args->source = mySource;
794 }
795
796 static void
T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,UErrorCode * err)797 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
798 UErrorCode * err)
799 {
800 const UChar *mySource = args->source;
801 unsigned char *myTarget;
802 int32_t *myOffsets;
803 const UChar *sourceLimit = args->sourceLimit;
804 const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
805 UChar32 ch, ch2;
806 unsigned int indexToWrite;
807 unsigned char temp[sizeof(uint32_t)];
808 int32_t offsetNum = 0;
809
810 if(mySource >= sourceLimit) {
811 /* no input, nothing to do */
812 return;
813 }
814
815 /* write the BOM if necessary */
816 if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
817 static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 };
818 ucnv_fromUWriteBytes(args->converter,
819 bom, 4,
820 &args->target, args->targetLimit,
821 &args->offsets, -1,
822 err);
823 args->converter->fromUnicodeStatus=0;
824 }
825
826 myTarget = (unsigned char *) args->target;
827 myOffsets = args->offsets;
828 temp[3] = 0;
829
830 if (args->converter->fromUChar32)
831 {
832 ch = args->converter->fromUChar32;
833 args->converter->fromUChar32 = 0;
834 goto lowsurogate;
835 }
836
837 while (mySource < sourceLimit && myTarget < targetLimit)
838 {
839 ch = *(mySource++);
840
841 if (UTF_IS_SURROGATE(ch)) {
842 if (U_IS_LEAD(ch))
843 {
844 lowsurogate:
845 if (mySource < sourceLimit)
846 {
847 ch2 = *mySource;
848 if (U_IS_TRAIL(ch2))
849 {
850 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
851 mySource++;
852 }
853 else {
854 /* this is an unmatched trail code unit (2nd surrogate) */
855 /* callback(illegal) */
856 args->converter->fromUChar32 = ch;
857 *err = U_ILLEGAL_CHAR_FOUND;
858 break;
859 }
860 }
861 else {
862 /* ran out of source */
863 args->converter->fromUChar32 = ch;
864 if (args->flush) {
865 /* this is an unmatched trail code unit (2nd surrogate) */
866 /* callback(illegal) */
867 *err = U_ILLEGAL_CHAR_FOUND;
868 }
869 break;
870 }
871 }
872 else {
873 /* this is an unmatched trail code unit (2nd surrogate) */
874 /* callback(illegal) */
875 args->converter->fromUChar32 = ch;
876 *err = U_ILLEGAL_CHAR_FOUND;
877 break;
878 }
879 }
880
881 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
882 temp[2] = (uint8_t) (ch >> 16 & 0x1F);
883 temp[1] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */
884 temp[0] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */
885
886 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
887 {
888 if (myTarget < targetLimit)
889 {
890 *(myTarget++) = temp[indexToWrite];
891 *(myOffsets++) = offsetNum;
892 }
893 else
894 {
895 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
896 *err = U_BUFFER_OVERFLOW_ERROR;
897 }
898 }
899 offsetNum = offsetNum + 1 + (temp[2] != 0);
900 }
901
902 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
903 {
904 *err = U_BUFFER_OVERFLOW_ERROR;
905 }
906
907 args->target = (char *) myTarget;
908 args->source = mySource;
909 args->offsets = myOffsets;
910 }
911
912 static UChar32
T_UConverter_getNextUChar_UTF32_LE(UConverterToUnicodeArgs * args,UErrorCode * err)913 T_UConverter_getNextUChar_UTF32_LE(UConverterToUnicodeArgs* args,
914 UErrorCode* err)
915 {
916 const uint8_t *mySource;
917 UChar32 myUChar;
918 int32_t length;
919
920 mySource = (const uint8_t *)args->source;
921 if (mySource >= (const uint8_t *)args->sourceLimit)
922 {
923 /* no input */
924 *err = U_INDEX_OUTOFBOUNDS_ERROR;
925 return 0xffff;
926 }
927
928 length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
929 if (length < 4)
930 {
931 /* got a partial character */
932 uprv_memcpy(args->converter->toUBytes, mySource, length);
933 args->converter->toULength = (int8_t)length;
934 args->source = (const char *)(mySource + length);
935 *err = U_TRUNCATED_CHAR_FOUND;
936 return 0xffff;
937 }
938
939 /* Don't even try to do a direct cast because the value may be on an odd address. */
940 myUChar = ((UChar32)mySource[3] << 24)
941 | ((UChar32)mySource[2] << 16)
942 | ((UChar32)mySource[1] << 8)
943 | ((UChar32)mySource[0]);
944
945 args->source = (const char *)(mySource + 4);
946 if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
947 return myUChar;
948 }
949
950 uprv_memcpy(args->converter->toUBytes, mySource, 4);
951 args->converter->toULength = 4;
952
953 *err = U_ILLEGAL_CHAR_FOUND;
954 return 0xffff;
955 }
956
957 static const UConverterImpl _UTF32LEImpl = {
958 UCNV_UTF32_LittleEndian,
959
960 NULL,
961 NULL,
962
963 NULL,
964 NULL,
965 NULL,
966
967 T_UConverter_toUnicode_UTF32_LE,
968 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC,
969 T_UConverter_fromUnicode_UTF32_LE,
970 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC,
971 T_UConverter_getNextUChar_UTF32_LE,
972
973 NULL,
974 NULL,
975 NULL,
976 NULL,
977 ucnv_getNonSurrogateUnicodeSet
978 };
979
980 /* The 1232 CCSID refers to any version of Unicode with any endianess of UTF-32 */
981 static const UConverterStaticData _UTF32LEStaticData = {
982 sizeof(UConverterStaticData),
983 "UTF-32LE",
984 1234,
985 UCNV_IBM, UCNV_UTF32_LittleEndian, 4, 4,
986 { 0xfd, 0xff, 0, 0 }, 4, FALSE, FALSE,
987 0,
988 0,
989 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
990 };
991
992
993 const UConverterSharedData _UTF32LEData = {
994 sizeof(UConverterSharedData), ~((uint32_t) 0),
995 NULL, NULL, &_UTF32LEStaticData, FALSE, &_UTF32LEImpl,
996 0
997 };
998
999 /* UTF-32 (Detect BOM) ------------------------------------------------------ */
1000
1001 /*
1002 * Detect a BOM at the beginning of the stream and select UTF-32BE or UTF-32LE
1003 * accordingly.
1004 *
1005 * State values:
1006 * 0 initial state
1007 * 1 saw 00
1008 * 2 saw 00 00
1009 * 3 saw 00 00 FE
1010 * 4 -
1011 * 5 saw FF
1012 * 6 saw FF FE
1013 * 7 saw FF FE 00
1014 * 8 UTF-32BE mode
1015 * 9 UTF-32LE mode
1016 *
1017 * During detection: state&3==number of matching bytes so far.
1018 *
1019 * On output, emit U+FEFF as the first code point.
1020 */
1021
1022 static void
_UTF32Reset(UConverter * cnv,UConverterResetChoice choice)1023 _UTF32Reset(UConverter *cnv, UConverterResetChoice choice) {
1024 if(choice<=UCNV_RESET_TO_UNICODE) {
1025 /* reset toUnicode: state=0 */
1026 cnv->mode=0;
1027 }
1028 if(choice!=UCNV_RESET_TO_UNICODE) {
1029 /* reset fromUnicode: prepare to output the UTF-32PE BOM */
1030 cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
1031 }
1032 }
1033
1034 static void
_UTF32Open(UConverter * cnv,UConverterLoadArgs * pArgs,UErrorCode * pErrorCode)1035 _UTF32Open(UConverter *cnv,
1036 UConverterLoadArgs *pArgs,
1037 UErrorCode *pErrorCode) {
1038 _UTF32Reset(cnv, UCNV_RESET_BOTH);
1039 }
1040
/*
 * Both UTF-32 byte order marks back to back, indexed directly by the
 * BOM-detection state: entries 0..3 are the UTF-32BE BOM, entries 4..7 the
 * UTF-32LE BOM.
 */
static const char utf32BOM[8] = {
    0, 0, (char)0xfe, (char)0xff,       /* 00 00 FE FF */
    (char)0xff, (char)0xfe, 0, 0        /* FF FE 00 00 */
};
1042
1043 static void
_UTF32ToUnicodeWithOffsets(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)1044 _UTF32ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
1045 UErrorCode *pErrorCode) {
1046 UConverter *cnv=pArgs->converter;
1047 const char *source=pArgs->source;
1048 const char *sourceLimit=pArgs->sourceLimit;
1049 int32_t *offsets=pArgs->offsets;
1050
1051 int32_t state, offsetDelta;
1052 char b;
1053
1054 state=cnv->mode;
1055
1056 /*
1057 * If we detect a BOM in this buffer, then we must add the BOM size to the
1058 * offsets because the actual converter function will not see and count the BOM.
1059 * offsetDelta will have the number of the BOM bytes that are in the current buffer.
1060 */
1061 offsetDelta=0;
1062
1063 while(source<sourceLimit && U_SUCCESS(*pErrorCode)) {
1064 switch(state) {
1065 case 0:
1066 b=*source;
1067 if(b==0) {
1068 state=1; /* could be 00 00 FE FF */
1069 } else if(b==(char)0xff) {
1070 state=5; /* could be FF FE 00 00 */
1071 } else {
1072 state=8; /* default to UTF-32BE */
1073 continue;
1074 }
1075 ++source;
1076 break;
1077 case 1:
1078 case 2:
1079 case 3:
1080 case 5:
1081 case 6:
1082 case 7:
1083 if(*source==utf32BOM[state]) {
1084 ++state;
1085 ++source;
1086 if(state==4) {
1087 state=8; /* detect UTF-32BE */
1088 offsetDelta=(int32_t)(source-pArgs->source);
1089 } else if(state==8) {
1090 state=9; /* detect UTF-32LE */
1091 offsetDelta=(int32_t)(source-pArgs->source);
1092 }
1093 } else {
1094 /* switch to UTF-32BE and pass the previous bytes */
1095 int32_t count=(int32_t)(source-pArgs->source); /* number of bytes from this buffer */
1096
1097 /* reset the source */
1098 source=pArgs->source;
1099
1100 if(count==(state&3)) {
1101 /* simple: all in the same buffer, just reset source */
1102 } else {
1103 UBool oldFlush=pArgs->flush;
1104
1105 /* some of the bytes are from a previous buffer, replay those first */
1106 pArgs->source=utf32BOM+(state&4); /* select the correct BOM */
1107 pArgs->sourceLimit=pArgs->source+((state&3)-count); /* replay previous bytes */
1108 pArgs->flush=FALSE; /* this sourceLimit is not the real source stream limit */
1109
1110 /* no offsets: bytes from previous buffer, and not enough for output */
1111 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1112
1113 /* restore real pointers; pArgs->source will be set in case 8/9 */
1114 pArgs->sourceLimit=sourceLimit;
1115 pArgs->flush=oldFlush;
1116 }
1117 state=8;
1118 continue;
1119 }
1120 break;
1121 case 8:
1122 /* call UTF-32BE */
1123 pArgs->source=source;
1124 if(offsets==NULL) {
1125 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1126 } else {
1127 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(pArgs, pErrorCode);
1128 }
1129 source=pArgs->source;
1130 break;
1131 case 9:
1132 /* call UTF-32LE */
1133 pArgs->source=source;
1134 if(offsets==NULL) {
1135 T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);
1136 } else {
1137 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(pArgs, pErrorCode);
1138 }
1139 source=pArgs->source;
1140 break;
1141 default:
1142 break; /* does not occur */
1143 }
1144 }
1145
1146 /* add BOM size to offsets - see comment at offsetDelta declaration */
1147 if(offsets!=NULL && offsetDelta!=0) {
1148 int32_t *offsetsLimit=pArgs->offsets;
1149 while(offsets<offsetsLimit) {
1150 *offsets++ += offsetDelta;
1151 }
1152 }
1153
1154 pArgs->source=source;
1155
1156 if(source==sourceLimit && pArgs->flush) {
1157 /* handle truncated input */
1158 switch(state) {
1159 case 0:
1160 break; /* no input at all, nothing to do */
1161 case 8:
1162 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1163 break;
1164 case 9:
1165 T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);
1166 break;
1167 default:
1168 /* handle 0<state<8: call UTF-32BE with too-short input */
1169 pArgs->source=utf32BOM+(state&4); /* select the correct BOM */
1170 pArgs->sourceLimit=pArgs->source+(state&3); /* replay bytes */
1171
1172 /* no offsets: not enough for output */
1173 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1174 pArgs->source=source;
1175 pArgs->sourceLimit=sourceLimit;
1176 state=8;
1177 break;
1178 }
1179 }
1180
1181 cnv->mode=state;
1182 }
1183
1184 static UChar32
_UTF32GetNextUChar(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)1185 _UTF32GetNextUChar(UConverterToUnicodeArgs *pArgs,
1186 UErrorCode *pErrorCode) {
1187 switch(pArgs->converter->mode) {
1188 case 8:
1189 return T_UConverter_getNextUChar_UTF32_BE(pArgs, pErrorCode);
1190 case 9:
1191 return T_UConverter_getNextUChar_UTF32_LE(pArgs, pErrorCode);
1192 default:
1193 return UCNV_GET_NEXT_UCHAR_USE_TO_U;
1194 }
1195 }
1196
1197 static const UConverterImpl _UTF32Impl = {
1198 UCNV_UTF32,
1199
1200 NULL,
1201 NULL,
1202
1203 _UTF32Open,
1204 NULL,
1205 _UTF32Reset,
1206
1207 _UTF32ToUnicodeWithOffsets,
1208 _UTF32ToUnicodeWithOffsets,
1209 #if U_IS_BIG_ENDIAN
1210 T_UConverter_fromUnicode_UTF32_BE,
1211 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC,
1212 #else
1213 T_UConverter_fromUnicode_UTF32_LE,
1214 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC,
1215 #endif
1216 _UTF32GetNextUChar,
1217
1218 NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
1219 NULL,
1220 NULL,
1221 NULL,
1222 ucnv_getNonSurrogateUnicodeSet
1223 };
1224
1225 /* The 1236 CCSID refers to any version of Unicode with a BOM sensitive endianess of UTF-32 */
1226 static const UConverterStaticData _UTF32StaticData = {
1227 sizeof(UConverterStaticData),
1228 "UTF-32",
1229 1236,
1230 UCNV_IBM, UCNV_UTF32, 4, 4,
1231 #if U_IS_BIG_ENDIAN
1232 { 0, 0, 0xff, 0xfd }, 4,
1233 #else
1234 { 0xfd, 0xff, 0, 0 }, 4,
1235 #endif
1236 FALSE, FALSE,
1237 0,
1238 0,
1239 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1240 };
1241
1242 const UConverterSharedData _UTF32Data = {
1243 sizeof(UConverterSharedData), ~((uint32_t) 0),
1244 NULL, NULL, &_UTF32StaticData, FALSE, &_UTF32Impl,
1245 0
1246 };
1247
1248 #endif
1249