1 /*
2 **********************************************************************
3 * Copyright (C) 2002-2015, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: ucnv_u32.c
7 * encoding: US-ASCII
8 * tab size: 8 (not used)
9 * indentation:4
10 *
11 * created on: 2002jul01
12 * created by: Markus W. Scherer
13 *
14 * UTF-32 converter implementation. Used to be in ucnv_utf.c.
15 */
16
17 #include "unicode/utypes.h"
18
19 #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
20
21 #include "unicode/ucnv.h"
22 #include "unicode/utf.h"
23 #include "ucnv_bld.h"
24 #include "ucnv_cnv.h"
25 #include "cmemory.h"
26
/* Largest code point that fits in one 16-bit code unit. */
#define MAXIMUM_UCS2            0x0000FFFF
/* Largest code point accepted as valid by the converters in this file. */
#define MAXIMUM_UTF             0x0010FFFF

/* Constants used when folding a surrogate pair into one code point. */
#define HALF_SHIFT              10
#define HALF_BASE               0x0010000
#define HALF_MASK               0x3FF
#define SURROGATE_HIGH_START    0xD800
#define SURROGATE_LOW_START     0xDC00

/*
 * Bias for the pair-combining expression
 *   ((lead - SURROGATE_HIGH_START) << HALF_SHIFT) + trail + SURROGATE_LOW_BASE
 * Derived from the constants above (evaluates to 9216) instead of being
 * spelled as a literal, so it cannot drift from them.
 */
#define SURROGATE_LOW_BASE      (HALF_BASE - SURROGATE_LOW_START)

enum {
    /* fromUnicodeStatus value meaning "emit the BOM before any other output" */
    UCNV_NEED_TO_WRITE_BOM=1
};
41
42 /* UTF-32BE ----------------------------------------------------------------- */
43
44 static void
T_UConverter_toUnicode_UTF32_BE(UConverterToUnicodeArgs * args,UErrorCode * err)45 T_UConverter_toUnicode_UTF32_BE(UConverterToUnicodeArgs * args,
46 UErrorCode * err)
47 {
48 const unsigned char *mySource = (unsigned char *) args->source;
49 UChar *myTarget = args->target;
50 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
51 const UChar *targetLimit = args->targetLimit;
52 unsigned char *toUBytes = args->converter->toUBytes;
53 uint32_t ch, i;
54
55 /* Restore state of current sequence */
56 if (args->converter->toUnicodeStatus && myTarget < targetLimit) {
57 i = args->converter->toULength; /* restore # of bytes consumed */
58 args->converter->toULength = 0;
59
60 ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
61 args->converter->toUnicodeStatus = 0;
62 goto morebytes;
63 }
64
65 while (mySource < sourceLimit && myTarget < targetLimit) {
66 i = 0;
67 ch = 0;
68 morebytes:
69 while (i < sizeof(uint32_t)) {
70 if (mySource < sourceLimit) {
71 ch = (ch << 8) | (uint8_t)(*mySource);
72 toUBytes[i++] = (char) *(mySource++);
73 }
74 else {
75 /* stores a partially calculated target*/
76 /* + 1 to make 0 a valid character */
77 args->converter->toUnicodeStatus = ch + 1;
78 args->converter->toULength = (int8_t) i;
79 goto donefornow;
80 }
81 }
82
83 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
84 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
85 if (ch <= MAXIMUM_UCS2)
86 {
87 /* fits in 16 bits */
88 *(myTarget++) = (UChar) ch;
89 }
90 else {
91 /* write out the surrogates */
92 *(myTarget++) = U16_LEAD(ch);
93 ch = U16_TRAIL(ch);
94 if (myTarget < targetLimit) {
95 *(myTarget++) = (UChar)ch;
96 }
97 else {
98 /* Put in overflow buffer (not handled here) */
99 args->converter->UCharErrorBuffer[0] = (UChar) ch;
100 args->converter->UCharErrorBufferLength = 1;
101 *err = U_BUFFER_OVERFLOW_ERROR;
102 break;
103 }
104 }
105 }
106 else {
107 args->converter->toULength = (int8_t)i;
108 *err = U_ILLEGAL_CHAR_FOUND;
109 break;
110 }
111 }
112
113 donefornow:
114 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
115 /* End of target buffer */
116 *err = U_BUFFER_OVERFLOW_ERROR;
117 }
118
119 args->target = myTarget;
120 args->source = (const char *) mySource;
121 }
122
123 static void
T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,UErrorCode * err)124 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
125 UErrorCode * err)
126 {
127 const unsigned char *mySource = (unsigned char *) args->source;
128 UChar *myTarget = args->target;
129 int32_t *myOffsets = args->offsets;
130 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
131 const UChar *targetLimit = args->targetLimit;
132 unsigned char *toUBytes = args->converter->toUBytes;
133 uint32_t ch, i;
134 int32_t offsetNum = 0;
135
136 /* Restore state of current sequence */
137 if (args->converter->toUnicodeStatus && myTarget < targetLimit) {
138 i = args->converter->toULength; /* restore # of bytes consumed */
139 args->converter->toULength = 0;
140
141 ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
142 args->converter->toUnicodeStatus = 0;
143 goto morebytes;
144 }
145
146 while (mySource < sourceLimit && myTarget < targetLimit) {
147 i = 0;
148 ch = 0;
149 morebytes:
150 while (i < sizeof(uint32_t)) {
151 if (mySource < sourceLimit) {
152 ch = (ch << 8) | (uint8_t)(*mySource);
153 toUBytes[i++] = (char) *(mySource++);
154 }
155 else {
156 /* stores a partially calculated target*/
157 /* + 1 to make 0 a valid character */
158 args->converter->toUnicodeStatus = ch + 1;
159 args->converter->toULength = (int8_t) i;
160 goto donefornow;
161 }
162 }
163
164 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
165 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
166 if (ch <= MAXIMUM_UCS2) {
167 /* fits in 16 bits */
168 *(myTarget++) = (UChar) ch;
169 *(myOffsets++) = offsetNum;
170 }
171 else {
172 /* write out the surrogates */
173 *(myTarget++) = U16_LEAD(ch);
174 *myOffsets++ = offsetNum;
175 ch = U16_TRAIL(ch);
176 if (myTarget < targetLimit)
177 {
178 *(myTarget++) = (UChar)ch;
179 *(myOffsets++) = offsetNum;
180 }
181 else {
182 /* Put in overflow buffer (not handled here) */
183 args->converter->UCharErrorBuffer[0] = (UChar) ch;
184 args->converter->UCharErrorBufferLength = 1;
185 *err = U_BUFFER_OVERFLOW_ERROR;
186 break;
187 }
188 }
189 }
190 else {
191 args->converter->toULength = (int8_t)i;
192 *err = U_ILLEGAL_CHAR_FOUND;
193 break;
194 }
195 offsetNum += i;
196 }
197
198 donefornow:
199 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
200 {
201 /* End of target buffer */
202 *err = U_BUFFER_OVERFLOW_ERROR;
203 }
204
205 args->target = myTarget;
206 args->source = (const char *) mySource;
207 args->offsets = myOffsets;
208 }
209
210 static void
T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs * args,UErrorCode * err)211 T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs * args,
212 UErrorCode * err)
213 {
214 const UChar *mySource = args->source;
215 unsigned char *myTarget;
216 const UChar *sourceLimit = args->sourceLimit;
217 const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
218 UChar32 ch, ch2;
219 unsigned int indexToWrite;
220 unsigned char temp[sizeof(uint32_t)];
221
222 if(mySource >= sourceLimit) {
223 /* no input, nothing to do */
224 return;
225 }
226
227 /* write the BOM if necessary */
228 if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
229 static const char bom[]={ 0, 0, (char)0xfe, (char)0xff };
230 ucnv_fromUWriteBytes(args->converter,
231 bom, 4,
232 &args->target, args->targetLimit,
233 &args->offsets, -1,
234 err);
235 args->converter->fromUnicodeStatus=0;
236 }
237
238 myTarget = (unsigned char *) args->target;
239 temp[0] = 0;
240
241 if (args->converter->fromUChar32) {
242 ch = args->converter->fromUChar32;
243 args->converter->fromUChar32 = 0;
244 goto lowsurogate;
245 }
246
247 while (mySource < sourceLimit && myTarget < targetLimit) {
248 ch = *(mySource++);
249
250 if (U_IS_SURROGATE(ch)) {
251 if (U_IS_LEAD(ch)) {
252 lowsurogate:
253 if (mySource < sourceLimit) {
254 ch2 = *mySource;
255 if (U_IS_TRAIL(ch2)) {
256 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
257 mySource++;
258 }
259 else {
260 /* this is an unmatched trail code unit (2nd surrogate) */
261 /* callback(illegal) */
262 args->converter->fromUChar32 = ch;
263 *err = U_ILLEGAL_CHAR_FOUND;
264 break;
265 }
266 }
267 else {
268 /* ran out of source */
269 args->converter->fromUChar32 = ch;
270 if (args->flush) {
271 /* this is an unmatched trail code unit (2nd surrogate) */
272 /* callback(illegal) */
273 *err = U_ILLEGAL_CHAR_FOUND;
274 }
275 break;
276 }
277 }
278 else {
279 /* this is an unmatched trail code unit (2nd surrogate) */
280 /* callback(illegal) */
281 args->converter->fromUChar32 = ch;
282 *err = U_ILLEGAL_CHAR_FOUND;
283 break;
284 }
285 }
286
287 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
288 temp[1] = (uint8_t) (ch >> 16 & 0x1F);
289 temp[2] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */
290 temp[3] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */
291
292 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {
293 if (myTarget < targetLimit) {
294 *(myTarget++) = temp[indexToWrite];
295 }
296 else {
297 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
298 *err = U_BUFFER_OVERFLOW_ERROR;
299 }
300 }
301 }
302
303 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
304 *err = U_BUFFER_OVERFLOW_ERROR;
305 }
306
307 args->target = (char *) myTarget;
308 args->source = mySource;
309 }
310
311 static void
T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,UErrorCode * err)312 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
313 UErrorCode * err)
314 {
315 const UChar *mySource = args->source;
316 unsigned char *myTarget;
317 int32_t *myOffsets;
318 const UChar *sourceLimit = args->sourceLimit;
319 const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
320 UChar32 ch, ch2;
321 int32_t offsetNum = 0;
322 unsigned int indexToWrite;
323 unsigned char temp[sizeof(uint32_t)];
324
325 if(mySource >= sourceLimit) {
326 /* no input, nothing to do */
327 return;
328 }
329
330 /* write the BOM if necessary */
331 if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
332 static const char bom[]={ 0, 0, (char)0xfe, (char)0xff };
333 ucnv_fromUWriteBytes(args->converter,
334 bom, 4,
335 &args->target, args->targetLimit,
336 &args->offsets, -1,
337 err);
338 args->converter->fromUnicodeStatus=0;
339 }
340
341 myTarget = (unsigned char *) args->target;
342 myOffsets = args->offsets;
343 temp[0] = 0;
344
345 if (args->converter->fromUChar32) {
346 ch = args->converter->fromUChar32;
347 args->converter->fromUChar32 = 0;
348 goto lowsurogate;
349 }
350
351 while (mySource < sourceLimit && myTarget < targetLimit) {
352 ch = *(mySource++);
353
354 if (U_IS_SURROGATE(ch)) {
355 if (U_IS_LEAD(ch)) {
356 lowsurogate:
357 if (mySource < sourceLimit) {
358 ch2 = *mySource;
359 if (U_IS_TRAIL(ch2)) {
360 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
361 mySource++;
362 }
363 else {
364 /* this is an unmatched trail code unit (2nd surrogate) */
365 /* callback(illegal) */
366 args->converter->fromUChar32 = ch;
367 *err = U_ILLEGAL_CHAR_FOUND;
368 break;
369 }
370 }
371 else {
372 /* ran out of source */
373 args->converter->fromUChar32 = ch;
374 if (args->flush) {
375 /* this is an unmatched trail code unit (2nd surrogate) */
376 /* callback(illegal) */
377 *err = U_ILLEGAL_CHAR_FOUND;
378 }
379 break;
380 }
381 }
382 else {
383 /* this is an unmatched trail code unit (2nd surrogate) */
384 /* callback(illegal) */
385 args->converter->fromUChar32 = ch;
386 *err = U_ILLEGAL_CHAR_FOUND;
387 break;
388 }
389 }
390
391 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
392 temp[1] = (uint8_t) (ch >> 16 & 0x1F);
393 temp[2] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */
394 temp[3] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */
395
396 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {
397 if (myTarget < targetLimit) {
398 *(myTarget++) = temp[indexToWrite];
399 *(myOffsets++) = offsetNum;
400 }
401 else {
402 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
403 *err = U_BUFFER_OVERFLOW_ERROR;
404 }
405 }
406 offsetNum = offsetNum + 1 + (temp[1] != 0);
407 }
408
409 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
410 *err = U_BUFFER_OVERFLOW_ERROR;
411 }
412
413 args->target = (char *) myTarget;
414 args->source = mySource;
415 args->offsets = myOffsets;
416 }
417
418 static UChar32
T_UConverter_getNextUChar_UTF32_BE(UConverterToUnicodeArgs * args,UErrorCode * err)419 T_UConverter_getNextUChar_UTF32_BE(UConverterToUnicodeArgs* args,
420 UErrorCode* err)
421 {
422 const uint8_t *mySource;
423 UChar32 myUChar;
424 int32_t length;
425
426 mySource = (const uint8_t *)args->source;
427 if (mySource >= (const uint8_t *)args->sourceLimit)
428 {
429 /* no input */
430 *err = U_INDEX_OUTOFBOUNDS_ERROR;
431 return 0xffff;
432 }
433
434 length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
435 if (length < 4)
436 {
437 /* got a partial character */
438 uprv_memcpy(args->converter->toUBytes, mySource, length);
439 args->converter->toULength = (int8_t)length;
440 args->source = (const char *)(mySource + length);
441 *err = U_TRUNCATED_CHAR_FOUND;
442 return 0xffff;
443 }
444
445 /* Don't even try to do a direct cast because the value may be on an odd address. */
446 myUChar = ((UChar32)mySource[0] << 24)
447 | ((UChar32)mySource[1] << 16)
448 | ((UChar32)mySource[2] << 8)
449 | ((UChar32)mySource[3]);
450
451 args->source = (const char *)(mySource + 4);
452 if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
453 return myUChar;
454 }
455
456 uprv_memcpy(args->converter->toUBytes, mySource, 4);
457 args->converter->toULength = 4;
458
459 *err = U_ILLEGAL_CHAR_FOUND;
460 return 0xffff;
461 }
462
463 static const UConverterImpl _UTF32BEImpl = {
464 UCNV_UTF32_BigEndian,
465
466 NULL,
467 NULL,
468
469 NULL,
470 NULL,
471 NULL,
472
473 T_UConverter_toUnicode_UTF32_BE,
474 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC,
475 T_UConverter_fromUnicode_UTF32_BE,
476 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC,
477 T_UConverter_getNextUChar_UTF32_BE,
478
479 NULL,
480 NULL,
481 NULL,
482 NULL,
483 ucnv_getNonSurrogateUnicodeSet
484 };
485
486 /* The 1232 CCSID refers to any version of Unicode with any endianess of UTF-32 */
487 static const UConverterStaticData _UTF32BEStaticData = {
488 sizeof(UConverterStaticData),
489 "UTF-32BE",
490 1232,
491 UCNV_IBM, UCNV_UTF32_BigEndian, 4, 4,
492 { 0, 0, 0xff, 0xfd }, 4, FALSE, FALSE,
493 0,
494 0,
495 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
496 };
497
498 const UConverterSharedData _UTF32BEData =
499 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF32BEStaticData, &_UTF32BEImpl);
500
501 /* UTF-32LE ---------------------------------------------------------- */
502
503 static void
T_UConverter_toUnicode_UTF32_LE(UConverterToUnicodeArgs * args,UErrorCode * err)504 T_UConverter_toUnicode_UTF32_LE(UConverterToUnicodeArgs * args,
505 UErrorCode * err)
506 {
507 const unsigned char *mySource = (unsigned char *) args->source;
508 UChar *myTarget = args->target;
509 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
510 const UChar *targetLimit = args->targetLimit;
511 unsigned char *toUBytes = args->converter->toUBytes;
512 uint32_t ch, i;
513
514 /* Restore state of current sequence */
515 if (args->converter->toUnicodeStatus && myTarget < targetLimit)
516 {
517 i = args->converter->toULength; /* restore # of bytes consumed */
518 args->converter->toULength = 0;
519
520 /* Stores the previously calculated ch from a previous call*/
521 ch = args->converter->toUnicodeStatus - 1;
522 args->converter->toUnicodeStatus = 0;
523 goto morebytes;
524 }
525
526 while (mySource < sourceLimit && myTarget < targetLimit)
527 {
528 i = 0;
529 ch = 0;
530 morebytes:
531 while (i < sizeof(uint32_t))
532 {
533 if (mySource < sourceLimit)
534 {
535 ch |= ((uint8_t)(*mySource)) << (i * 8);
536 toUBytes[i++] = (char) *(mySource++);
537 }
538 else
539 {
540 /* stores a partially calculated target*/
541 /* + 1 to make 0 a valid character */
542 args->converter->toUnicodeStatus = ch + 1;
543 args->converter->toULength = (int8_t) i;
544 goto donefornow;
545 }
546 }
547
548 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
549 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
550 if (ch <= MAXIMUM_UCS2) {
551 /* fits in 16 bits */
552 *(myTarget++) = (UChar) ch;
553 }
554 else {
555 /* write out the surrogates */
556 *(myTarget++) = U16_LEAD(ch);
557 ch = U16_TRAIL(ch);
558 if (myTarget < targetLimit) {
559 *(myTarget++) = (UChar)ch;
560 }
561 else {
562 /* Put in overflow buffer (not handled here) */
563 args->converter->UCharErrorBuffer[0] = (UChar) ch;
564 args->converter->UCharErrorBufferLength = 1;
565 *err = U_BUFFER_OVERFLOW_ERROR;
566 break;
567 }
568 }
569 }
570 else {
571 args->converter->toULength = (int8_t)i;
572 *err = U_ILLEGAL_CHAR_FOUND;
573 break;
574 }
575 }
576
577 donefornow:
578 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
579 {
580 /* End of target buffer */
581 *err = U_BUFFER_OVERFLOW_ERROR;
582 }
583
584 args->target = myTarget;
585 args->source = (const char *) mySource;
586 }
587
588 static void
T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,UErrorCode * err)589 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
590 UErrorCode * err)
591 {
592 const unsigned char *mySource = (unsigned char *) args->source;
593 UChar *myTarget = args->target;
594 int32_t *myOffsets = args->offsets;
595 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
596 const UChar *targetLimit = args->targetLimit;
597 unsigned char *toUBytes = args->converter->toUBytes;
598 uint32_t ch, i;
599 int32_t offsetNum = 0;
600
601 /* Restore state of current sequence */
602 if (args->converter->toUnicodeStatus && myTarget < targetLimit)
603 {
604 i = args->converter->toULength; /* restore # of bytes consumed */
605 args->converter->toULength = 0;
606
607 /* Stores the previously calculated ch from a previous call*/
608 ch = args->converter->toUnicodeStatus - 1;
609 args->converter->toUnicodeStatus = 0;
610 goto morebytes;
611 }
612
613 while (mySource < sourceLimit && myTarget < targetLimit)
614 {
615 i = 0;
616 ch = 0;
617 morebytes:
618 while (i < sizeof(uint32_t))
619 {
620 if (mySource < sourceLimit)
621 {
622 ch |= ((uint8_t)(*mySource)) << (i * 8);
623 toUBytes[i++] = (char) *(mySource++);
624 }
625 else
626 {
627 /* stores a partially calculated target*/
628 /* + 1 to make 0 a valid character */
629 args->converter->toUnicodeStatus = ch + 1;
630 args->converter->toULength = (int8_t) i;
631 goto donefornow;
632 }
633 }
634
635 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch))
636 {
637 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
638 if (ch <= MAXIMUM_UCS2)
639 {
640 /* fits in 16 bits */
641 *(myTarget++) = (UChar) ch;
642 *(myOffsets++) = offsetNum;
643 }
644 else {
645 /* write out the surrogates */
646 *(myTarget++) = U16_LEAD(ch);
647 *(myOffsets++) = offsetNum;
648 ch = U16_TRAIL(ch);
649 if (myTarget < targetLimit)
650 {
651 *(myTarget++) = (UChar)ch;
652 *(myOffsets++) = offsetNum;
653 }
654 else
655 {
656 /* Put in overflow buffer (not handled here) */
657 args->converter->UCharErrorBuffer[0] = (UChar) ch;
658 args->converter->UCharErrorBufferLength = 1;
659 *err = U_BUFFER_OVERFLOW_ERROR;
660 break;
661 }
662 }
663 }
664 else
665 {
666 args->converter->toULength = (int8_t)i;
667 *err = U_ILLEGAL_CHAR_FOUND;
668 break;
669 }
670 offsetNum += i;
671 }
672
673 donefornow:
674 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
675 {
676 /* End of target buffer */
677 *err = U_BUFFER_OVERFLOW_ERROR;
678 }
679
680 args->target = myTarget;
681 args->source = (const char *) mySource;
682 args->offsets = myOffsets;
683 }
684
685 static void
T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs * args,UErrorCode * err)686 T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs * args,
687 UErrorCode * err)
688 {
689 const UChar *mySource = args->source;
690 unsigned char *myTarget;
691 const UChar *sourceLimit = args->sourceLimit;
692 const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
693 UChar32 ch, ch2;
694 unsigned int indexToWrite;
695 unsigned char temp[sizeof(uint32_t)];
696
697 if(mySource >= sourceLimit) {
698 /* no input, nothing to do */
699 return;
700 }
701
702 /* write the BOM if necessary */
703 if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
704 static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 };
705 ucnv_fromUWriteBytes(args->converter,
706 bom, 4,
707 &args->target, args->targetLimit,
708 &args->offsets, -1,
709 err);
710 args->converter->fromUnicodeStatus=0;
711 }
712
713 myTarget = (unsigned char *) args->target;
714 temp[3] = 0;
715
716 if (args->converter->fromUChar32)
717 {
718 ch = args->converter->fromUChar32;
719 args->converter->fromUChar32 = 0;
720 goto lowsurogate;
721 }
722
723 while (mySource < sourceLimit && myTarget < targetLimit)
724 {
725 ch = *(mySource++);
726
727 if (U16_IS_SURROGATE(ch)) {
728 if (U16_IS_LEAD(ch))
729 {
730 lowsurogate:
731 if (mySource < sourceLimit)
732 {
733 ch2 = *mySource;
734 if (U16_IS_TRAIL(ch2)) {
735 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
736 mySource++;
737 }
738 else {
739 /* this is an unmatched trail code unit (2nd surrogate) */
740 /* callback(illegal) */
741 args->converter->fromUChar32 = ch;
742 *err = U_ILLEGAL_CHAR_FOUND;
743 break;
744 }
745 }
746 else {
747 /* ran out of source */
748 args->converter->fromUChar32 = ch;
749 if (args->flush) {
750 /* this is an unmatched trail code unit (2nd surrogate) */
751 /* callback(illegal) */
752 *err = U_ILLEGAL_CHAR_FOUND;
753 }
754 break;
755 }
756 }
757 else {
758 /* this is an unmatched trail code unit (2nd surrogate) */
759 /* callback(illegal) */
760 args->converter->fromUChar32 = ch;
761 *err = U_ILLEGAL_CHAR_FOUND;
762 break;
763 }
764 }
765
766 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
767 temp[2] = (uint8_t) (ch >> 16 & 0x1F);
768 temp[1] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */
769 temp[0] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */
770
771 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
772 {
773 if (myTarget < targetLimit)
774 {
775 *(myTarget++) = temp[indexToWrite];
776 }
777 else
778 {
779 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
780 *err = U_BUFFER_OVERFLOW_ERROR;
781 }
782 }
783 }
784
785 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
786 {
787 *err = U_BUFFER_OVERFLOW_ERROR;
788 }
789
790 args->target = (char *) myTarget;
791 args->source = mySource;
792 }
793
794 static void
T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,UErrorCode * err)795 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
796 UErrorCode * err)
797 {
798 const UChar *mySource = args->source;
799 unsigned char *myTarget;
800 int32_t *myOffsets;
801 const UChar *sourceLimit = args->sourceLimit;
802 const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
803 UChar32 ch, ch2;
804 unsigned int indexToWrite;
805 unsigned char temp[sizeof(uint32_t)];
806 int32_t offsetNum = 0;
807
808 if(mySource >= sourceLimit) {
809 /* no input, nothing to do */
810 return;
811 }
812
813 /* write the BOM if necessary */
814 if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
815 static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 };
816 ucnv_fromUWriteBytes(args->converter,
817 bom, 4,
818 &args->target, args->targetLimit,
819 &args->offsets, -1,
820 err);
821 args->converter->fromUnicodeStatus=0;
822 }
823
824 myTarget = (unsigned char *) args->target;
825 myOffsets = args->offsets;
826 temp[3] = 0;
827
828 if (args->converter->fromUChar32)
829 {
830 ch = args->converter->fromUChar32;
831 args->converter->fromUChar32 = 0;
832 goto lowsurogate;
833 }
834
835 while (mySource < sourceLimit && myTarget < targetLimit)
836 {
837 ch = *(mySource++);
838
839 if (U16_IS_SURROGATE(ch)) {
840 if (U16_IS_LEAD(ch))
841 {
842 lowsurogate:
843 if (mySource < sourceLimit)
844 {
845 ch2 = *mySource;
846 if (U16_IS_TRAIL(ch2))
847 {
848 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
849 mySource++;
850 }
851 else {
852 /* this is an unmatched trail code unit (2nd surrogate) */
853 /* callback(illegal) */
854 args->converter->fromUChar32 = ch;
855 *err = U_ILLEGAL_CHAR_FOUND;
856 break;
857 }
858 }
859 else {
860 /* ran out of source */
861 args->converter->fromUChar32 = ch;
862 if (args->flush) {
863 /* this is an unmatched trail code unit (2nd surrogate) */
864 /* callback(illegal) */
865 *err = U_ILLEGAL_CHAR_FOUND;
866 }
867 break;
868 }
869 }
870 else {
871 /* this is an unmatched trail code unit (2nd surrogate) */
872 /* callback(illegal) */
873 args->converter->fromUChar32 = ch;
874 *err = U_ILLEGAL_CHAR_FOUND;
875 break;
876 }
877 }
878
879 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
880 temp[2] = (uint8_t) (ch >> 16 & 0x1F);
881 temp[1] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */
882 temp[0] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */
883
884 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
885 {
886 if (myTarget < targetLimit)
887 {
888 *(myTarget++) = temp[indexToWrite];
889 *(myOffsets++) = offsetNum;
890 }
891 else
892 {
893 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
894 *err = U_BUFFER_OVERFLOW_ERROR;
895 }
896 }
897 offsetNum = offsetNum + 1 + (temp[2] != 0);
898 }
899
900 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
901 {
902 *err = U_BUFFER_OVERFLOW_ERROR;
903 }
904
905 args->target = (char *) myTarget;
906 args->source = mySource;
907 args->offsets = myOffsets;
908 }
909
910 static UChar32
T_UConverter_getNextUChar_UTF32_LE(UConverterToUnicodeArgs * args,UErrorCode * err)911 T_UConverter_getNextUChar_UTF32_LE(UConverterToUnicodeArgs* args,
912 UErrorCode* err)
913 {
914 const uint8_t *mySource;
915 UChar32 myUChar;
916 int32_t length;
917
918 mySource = (const uint8_t *)args->source;
919 if (mySource >= (const uint8_t *)args->sourceLimit)
920 {
921 /* no input */
922 *err = U_INDEX_OUTOFBOUNDS_ERROR;
923 return 0xffff;
924 }
925
926 length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
927 if (length < 4)
928 {
929 /* got a partial character */
930 uprv_memcpy(args->converter->toUBytes, mySource, length);
931 args->converter->toULength = (int8_t)length;
932 args->source = (const char *)(mySource + length);
933 *err = U_TRUNCATED_CHAR_FOUND;
934 return 0xffff;
935 }
936
937 /* Don't even try to do a direct cast because the value may be on an odd address. */
938 myUChar = ((UChar32)mySource[3] << 24)
939 | ((UChar32)mySource[2] << 16)
940 | ((UChar32)mySource[1] << 8)
941 | ((UChar32)mySource[0]);
942
943 args->source = (const char *)(mySource + 4);
944 if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
945 return myUChar;
946 }
947
948 uprv_memcpy(args->converter->toUBytes, mySource, 4);
949 args->converter->toULength = 4;
950
951 *err = U_ILLEGAL_CHAR_FOUND;
952 return 0xffff;
953 }
954
955 static const UConverterImpl _UTF32LEImpl = {
956 UCNV_UTF32_LittleEndian,
957
958 NULL,
959 NULL,
960
961 NULL,
962 NULL,
963 NULL,
964
965 T_UConverter_toUnicode_UTF32_LE,
966 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC,
967 T_UConverter_fromUnicode_UTF32_LE,
968 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC,
969 T_UConverter_getNextUChar_UTF32_LE,
970
971 NULL,
972 NULL,
973 NULL,
974 NULL,
975 ucnv_getNonSurrogateUnicodeSet
976 };
977
978 /* The 1232 CCSID refers to any version of Unicode with any endianess of UTF-32 */
979 static const UConverterStaticData _UTF32LEStaticData = {
980 sizeof(UConverterStaticData),
981 "UTF-32LE",
982 1234,
983 UCNV_IBM, UCNV_UTF32_LittleEndian, 4, 4,
984 { 0xfd, 0xff, 0, 0 }, 4, FALSE, FALSE,
985 0,
986 0,
987 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
988 };
989
990
991 const UConverterSharedData _UTF32LEData =
992 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF32LEStaticData, &_UTF32LEImpl);
993
994 /* UTF-32 (Detect BOM) ------------------------------------------------------ */
995
996 /*
997 * Detect a BOM at the beginning of the stream and select UTF-32BE or UTF-32LE
998 * accordingly.
999 *
1000 * State values:
1001 * 0 initial state
1002 * 1 saw 00
1003 * 2 saw 00 00
1004 * 3 saw 00 00 FE
1005 * 4 -
1006 * 5 saw FF
1007 * 6 saw FF FE
1008 * 7 saw FF FE 00
1009 * 8 UTF-32BE mode
1010 * 9 UTF-32LE mode
1011 *
1012 * During detection: state&3==number of matching bytes so far.
1013 *
1014 * On output, emit U+FEFF as the first code point.
1015 */
1016
1017 static void
_UTF32Reset(UConverter * cnv,UConverterResetChoice choice)1018 _UTF32Reset(UConverter *cnv, UConverterResetChoice choice) {
1019 if(choice<=UCNV_RESET_TO_UNICODE) {
1020 /* reset toUnicode: state=0 */
1021 cnv->mode=0;
1022 }
1023 if(choice!=UCNV_RESET_TO_UNICODE) {
1024 /* reset fromUnicode: prepare to output the UTF-32PE BOM */
1025 cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
1026 }
1027 }
1028
1029 static void
_UTF32Open(UConverter * cnv,UConverterLoadArgs * pArgs,UErrorCode * pErrorCode)1030 _UTF32Open(UConverter *cnv,
1031 UConverterLoadArgs *pArgs,
1032 UErrorCode *pErrorCode) {
1033 _UTF32Reset(cnv, UCNV_RESET_BOTH);
1034 }
1035
1036 static const char utf32BOM[8]={ 0, 0, (char)0xfe, (char)0xff, (char)0xff, (char)0xfe, 0, 0 };
1037
1038 static void
_UTF32ToUnicodeWithOffsets(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)1039 _UTF32ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
1040 UErrorCode *pErrorCode) {
1041 UConverter *cnv=pArgs->converter;
1042 const char *source=pArgs->source;
1043 const char *sourceLimit=pArgs->sourceLimit;
1044 int32_t *offsets=pArgs->offsets;
1045
1046 int32_t state, offsetDelta;
1047 char b;
1048
1049 state=cnv->mode;
1050
1051 /*
1052 * If we detect a BOM in this buffer, then we must add the BOM size to the
1053 * offsets because the actual converter function will not see and count the BOM.
1054 * offsetDelta will have the number of the BOM bytes that are in the current buffer.
1055 */
1056 offsetDelta=0;
1057
1058 while(source<sourceLimit && U_SUCCESS(*pErrorCode)) {
1059 switch(state) {
1060 case 0:
1061 b=*source;
1062 if(b==0) {
1063 state=1; /* could be 00 00 FE FF */
1064 } else if(b==(char)0xff) {
1065 state=5; /* could be FF FE 00 00 */
1066 } else {
1067 state=8; /* default to UTF-32BE */
1068 continue;
1069 }
1070 ++source;
1071 break;
1072 case 1:
1073 case 2:
1074 case 3:
1075 case 5:
1076 case 6:
1077 case 7:
1078 if(*source==utf32BOM[state]) {
1079 ++state;
1080 ++source;
1081 if(state==4) {
1082 state=8; /* detect UTF-32BE */
1083 offsetDelta=(int32_t)(source-pArgs->source);
1084 } else if(state==8) {
1085 state=9; /* detect UTF-32LE */
1086 offsetDelta=(int32_t)(source-pArgs->source);
1087 }
1088 } else {
1089 /* switch to UTF-32BE and pass the previous bytes */
1090 int32_t count=(int32_t)(source-pArgs->source); /* number of bytes from this buffer */
1091
1092 /* reset the source */
1093 source=pArgs->source;
1094
1095 if(count==(state&3)) {
1096 /* simple: all in the same buffer, just reset source */
1097 } else {
1098 UBool oldFlush=pArgs->flush;
1099
1100 /* some of the bytes are from a previous buffer, replay those first */
1101 pArgs->source=utf32BOM+(state&4); /* select the correct BOM */
1102 pArgs->sourceLimit=pArgs->source+((state&3)-count); /* replay previous bytes */
1103 pArgs->flush=FALSE; /* this sourceLimit is not the real source stream limit */
1104
1105 /* no offsets: bytes from previous buffer, and not enough for output */
1106 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1107
1108 /* restore real pointers; pArgs->source will be set in case 8/9 */
1109 pArgs->sourceLimit=sourceLimit;
1110 pArgs->flush=oldFlush;
1111 }
1112 state=8;
1113 continue;
1114 }
1115 break;
1116 case 8:
1117 /* call UTF-32BE */
1118 pArgs->source=source;
1119 if(offsets==NULL) {
1120 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1121 } else {
1122 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(pArgs, pErrorCode);
1123 }
1124 source=pArgs->source;
1125 break;
1126 case 9:
1127 /* call UTF-32LE */
1128 pArgs->source=source;
1129 if(offsets==NULL) {
1130 T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);
1131 } else {
1132 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(pArgs, pErrorCode);
1133 }
1134 source=pArgs->source;
1135 break;
1136 default:
1137 break; /* does not occur */
1138 }
1139 }
1140
1141 /* add BOM size to offsets - see comment at offsetDelta declaration */
1142 if(offsets!=NULL && offsetDelta!=0) {
1143 int32_t *offsetsLimit=pArgs->offsets;
1144 while(offsets<offsetsLimit) {
1145 *offsets++ += offsetDelta;
1146 }
1147 }
1148
1149 pArgs->source=source;
1150
1151 if(source==sourceLimit && pArgs->flush) {
1152 /* handle truncated input */
1153 switch(state) {
1154 case 0:
1155 break; /* no input at all, nothing to do */
1156 case 8:
1157 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1158 break;
1159 case 9:
1160 T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);
1161 break;
1162 default:
1163 /* handle 0<state<8: call UTF-32BE with too-short input */
1164 pArgs->source=utf32BOM+(state&4); /* select the correct BOM */
1165 pArgs->sourceLimit=pArgs->source+(state&3); /* replay bytes */
1166
1167 /* no offsets: not enough for output */
1168 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1169 pArgs->source=source;
1170 pArgs->sourceLimit=sourceLimit;
1171 state=8;
1172 break;
1173 }
1174 }
1175
1176 cnv->mode=state;
1177 }
1178
1179 static UChar32
_UTF32GetNextUChar(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)1180 _UTF32GetNextUChar(UConverterToUnicodeArgs *pArgs,
1181 UErrorCode *pErrorCode) {
1182 switch(pArgs->converter->mode) {
1183 case 8:
1184 return T_UConverter_getNextUChar_UTF32_BE(pArgs, pErrorCode);
1185 case 9:
1186 return T_UConverter_getNextUChar_UTF32_LE(pArgs, pErrorCode);
1187 default:
1188 return UCNV_GET_NEXT_UCHAR_USE_TO_U;
1189 }
1190 }
1191
1192 static const UConverterImpl _UTF32Impl = {
1193 UCNV_UTF32,
1194
1195 NULL,
1196 NULL,
1197
1198 _UTF32Open,
1199 NULL,
1200 _UTF32Reset,
1201
1202 _UTF32ToUnicodeWithOffsets,
1203 _UTF32ToUnicodeWithOffsets,
1204 #if U_IS_BIG_ENDIAN
1205 T_UConverter_fromUnicode_UTF32_BE,
1206 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC,
1207 #else
1208 T_UConverter_fromUnicode_UTF32_LE,
1209 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC,
1210 #endif
1211 _UTF32GetNextUChar,
1212
1213 NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
1214 NULL,
1215 NULL,
1216 NULL,
1217 ucnv_getNonSurrogateUnicodeSet
1218 };
1219
1220 /* The 1236 CCSID refers to any version of Unicode with a BOM sensitive endianess of UTF-32 */
1221 static const UConverterStaticData _UTF32StaticData = {
1222 sizeof(UConverterStaticData),
1223 "UTF-32",
1224 1236,
1225 UCNV_IBM, UCNV_UTF32, 4, 4,
1226 #if U_IS_BIG_ENDIAN
1227 { 0, 0, 0xff, 0xfd }, 4,
1228 #else
1229 { 0xfd, 0xff, 0, 0 }, 4,
1230 #endif
1231 FALSE, FALSE,
1232 0,
1233 0,
1234 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1235 };
1236
1237 const UConverterSharedData _UTF32Data =
1238 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF32StaticData, &_UTF32Impl);
1239
1240 #endif
1241