• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 **********************************************************************
3 *   Copyright (C) 2002-2009, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 **********************************************************************
6 *   file name:  ucnv_u32.c
7 *   encoding:   US-ASCII
8 *   tab size:   8 (not used)
9 *   indentation:4
10 *
11 *   created on: 2002jul01
12 *   created by: Markus W. Scherer
13 *
14 *   UTF-32 converter implementation. Used to be in ucnv_utf.c.
15 */
16 
17 #include "unicode/utypes.h"
18 
19 #if !UCONFIG_NO_CONVERSION
20 
21 #include "unicode/ucnv.h"
22 #include "ucnv_bld.h"
23 #include "ucnv_cnv.h"
24 #include "cmemory.h"
25 
/* Largest code point that fits in one UTF-16 code unit. */
#define MAXIMUM_UCS2            0x0000FFFF
/* Largest code point accepted by the converters in this file. */
#define MAXIMUM_UTF             0x0010FFFF
/* Number of code point bits carried by each half of a surrogate pair. */
#define HALF_SHIFT              10
/* First code point that needs a surrogate pair in UTF-16. */
#define HALF_BASE               0x0010000
/* Mask selecting the HALF_SHIFT payload bits of a surrogate. */
#define HALF_MASK               0x3FF
/* First lead (high) surrogate code unit. */
#define SURROGATE_HIGH_START    0xD800
/* First trail (low) surrogate code unit. */
#define SURROGATE_LOW_START     0xDC00

/*
 * Bias added when combining a surrogate pair:
 *   ((lead - SURROGATE_HIGH_START) << HALF_SHIFT) + trail + SURROGATE_LOW_BASE
 * It removes the trail surrogate's base and adds the supplementary-plane
 * base in one step. Derived from the named constants (value 9216) instead
 * of being spelled as a separate magic number.
 */
#define SURROGATE_LOW_BASE      (HALF_BASE - SURROGATE_LOW_START)

enum {
    /* fromUnicodeStatus value that makes the fromUnicode functions emit a BOM first */
    UCNV_NEED_TO_WRITE_BOM=1
};
40 
41 /* UTF-32BE ----------------------------------------------------------------- */
42 
43 static void
T_UConverter_toUnicode_UTF32_BE(UConverterToUnicodeArgs * args,UErrorCode * err)44 T_UConverter_toUnicode_UTF32_BE(UConverterToUnicodeArgs * args,
45                                 UErrorCode * err)
46 {
47     const unsigned char *mySource = (unsigned char *) args->source;
48     UChar *myTarget = args->target;
49     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
50     const UChar *targetLimit = args->targetLimit;
51     unsigned char *toUBytes = args->converter->toUBytes;
52     uint32_t ch, i;
53 
54     /* Restore state of current sequence */
55     if (args->converter->toUnicodeStatus && myTarget < targetLimit) {
56         i = args->converter->toULength;       /* restore # of bytes consumed */
57         args->converter->toULength = 0;
58 
59         ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
60         args->converter->toUnicodeStatus = 0;
61         goto morebytes;
62     }
63 
64     while (mySource < sourceLimit && myTarget < targetLimit) {
65         i = 0;
66         ch = 0;
67 morebytes:
68         while (i < sizeof(uint32_t)) {
69             if (mySource < sourceLimit) {
70                 ch = (ch << 8) | (uint8_t)(*mySource);
71                 toUBytes[i++] = (char) *(mySource++);
72             }
73             else {
74                 /* stores a partially calculated target*/
75                 /* + 1 to make 0 a valid character */
76                 args->converter->toUnicodeStatus = ch + 1;
77                 args->converter->toULength = (int8_t) i;
78                 goto donefornow;
79             }
80         }
81 
82         if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
83             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
84             if (ch <= MAXIMUM_UCS2)
85             {
86                 /* fits in 16 bits */
87                 *(myTarget++) = (UChar) ch;
88             }
89             else {
90                 /* write out the surrogates */
91                 *(myTarget++) = U16_LEAD(ch);
92                 ch = U16_TRAIL(ch);
93                 if (myTarget < targetLimit) {
94                     *(myTarget++) = (UChar)ch;
95                 }
96                 else {
97                     /* Put in overflow buffer (not handled here) */
98                     args->converter->UCharErrorBuffer[0] = (UChar) ch;
99                     args->converter->UCharErrorBufferLength = 1;
100                     *err = U_BUFFER_OVERFLOW_ERROR;
101                     break;
102                 }
103             }
104         }
105         else {
106             args->converter->toULength = (int8_t)i;
107             *err = U_ILLEGAL_CHAR_FOUND;
108             break;
109         }
110     }
111 
112 donefornow:
113     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
114         /* End of target buffer */
115         *err = U_BUFFER_OVERFLOW_ERROR;
116     }
117 
118     args->target = myTarget;
119     args->source = (const char *) mySource;
120 }
121 
122 static void
T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,UErrorCode * err)123 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
124                                              UErrorCode * err)
125 {
126     const unsigned char *mySource = (unsigned char *) args->source;
127     UChar *myTarget = args->target;
128     int32_t *myOffsets = args->offsets;
129     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
130     const UChar *targetLimit = args->targetLimit;
131     unsigned char *toUBytes = args->converter->toUBytes;
132     uint32_t ch, i;
133     int32_t offsetNum = 0;
134 
135     /* Restore state of current sequence */
136     if (args->converter->toUnicodeStatus && myTarget < targetLimit) {
137         i = args->converter->toULength;       /* restore # of bytes consumed */
138         args->converter->toULength = 0;
139 
140         ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
141         args->converter->toUnicodeStatus = 0;
142         goto morebytes;
143     }
144 
145     while (mySource < sourceLimit && myTarget < targetLimit) {
146         i = 0;
147         ch = 0;
148 morebytes:
149         while (i < sizeof(uint32_t)) {
150             if (mySource < sourceLimit) {
151                 ch = (ch << 8) | (uint8_t)(*mySource);
152                 toUBytes[i++] = (char) *(mySource++);
153             }
154             else {
155                 /* stores a partially calculated target*/
156                 /* + 1 to make 0 a valid character */
157                 args->converter->toUnicodeStatus = ch + 1;
158                 args->converter->toULength = (int8_t) i;
159                 goto donefornow;
160             }
161         }
162 
163         if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
164             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
165             if (ch <= MAXIMUM_UCS2) {
166                 /* fits in 16 bits */
167                 *(myTarget++) = (UChar) ch;
168                 *(myOffsets++) = offsetNum;
169             }
170             else {
171                 /* write out the surrogates */
172                 *(myTarget++) = U16_LEAD(ch);
173                 *myOffsets++ = offsetNum;
174                 ch = U16_TRAIL(ch);
175                 if (myTarget < targetLimit)
176                 {
177                     *(myTarget++) = (UChar)ch;
178                     *(myOffsets++) = offsetNum;
179                 }
180                 else {
181                     /* Put in overflow buffer (not handled here) */
182                     args->converter->UCharErrorBuffer[0] = (UChar) ch;
183                     args->converter->UCharErrorBufferLength = 1;
184                     *err = U_BUFFER_OVERFLOW_ERROR;
185                     break;
186                 }
187             }
188         }
189         else {
190             args->converter->toULength = (int8_t)i;
191             *err = U_ILLEGAL_CHAR_FOUND;
192             break;
193         }
194         offsetNum += i;
195     }
196 
197 donefornow:
198     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
199     {
200         /* End of target buffer */
201         *err = U_BUFFER_OVERFLOW_ERROR;
202     }
203 
204     args->target = myTarget;
205     args->source = (const char *) mySource;
206     args->offsets = myOffsets;
207 }
208 
209 static void
T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs * args,UErrorCode * err)210 T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs * args,
211                                   UErrorCode * err)
212 {
213     const UChar *mySource = args->source;
214     unsigned char *myTarget;
215     const UChar *sourceLimit = args->sourceLimit;
216     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
217     UChar32 ch, ch2;
218     unsigned int indexToWrite;
219     unsigned char temp[sizeof(uint32_t)];
220 
221     if(mySource >= sourceLimit) {
222         /* no input, nothing to do */
223         return;
224     }
225 
226     /* write the BOM if necessary */
227     if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
228         static const char bom[]={ 0, 0, (char)0xfe, (char)0xff };
229         ucnv_fromUWriteBytes(args->converter,
230                              bom, 4,
231                              &args->target, args->targetLimit,
232                              &args->offsets, -1,
233                              err);
234         args->converter->fromUnicodeStatus=0;
235     }
236 
237     myTarget = (unsigned char *) args->target;
238     temp[0] = 0;
239 
240     if (args->converter->fromUChar32) {
241         ch = args->converter->fromUChar32;
242         args->converter->fromUChar32 = 0;
243         goto lowsurogate;
244     }
245 
246     while (mySource < sourceLimit && myTarget < targetLimit) {
247         ch = *(mySource++);
248 
249         if (UTF_IS_SURROGATE(ch)) {
250             if (U_IS_LEAD(ch)) {
251 lowsurogate:
252                 if (mySource < sourceLimit) {
253                     ch2 = *mySource;
254                     if (U_IS_TRAIL(ch2)) {
255                         ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
256                         mySource++;
257                     }
258                     else {
259                         /* this is an unmatched trail code unit (2nd surrogate) */
260                         /* callback(illegal) */
261                         args->converter->fromUChar32 = ch;
262                         *err = U_ILLEGAL_CHAR_FOUND;
263                         break;
264                     }
265                 }
266                 else {
267                     /* ran out of source */
268                     args->converter->fromUChar32 = ch;
269                     if (args->flush) {
270                         /* this is an unmatched trail code unit (2nd surrogate) */
271                         /* callback(illegal) */
272                         *err = U_ILLEGAL_CHAR_FOUND;
273                     }
274                     break;
275                 }
276             }
277             else {
278                 /* this is an unmatched trail code unit (2nd surrogate) */
279                 /* callback(illegal) */
280                 args->converter->fromUChar32 = ch;
281                 *err = U_ILLEGAL_CHAR_FOUND;
282                 break;
283             }
284         }
285 
286         /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
287         temp[1] = (uint8_t) (ch >> 16 & 0x1F);
288         temp[2] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
289         temp[3] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
290 
291         for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {
292             if (myTarget < targetLimit) {
293                 *(myTarget++) = temp[indexToWrite];
294             }
295             else {
296                 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
297                 *err = U_BUFFER_OVERFLOW_ERROR;
298             }
299         }
300     }
301 
302     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
303         *err = U_BUFFER_OVERFLOW_ERROR;
304     }
305 
306     args->target = (char *) myTarget;
307     args->source = mySource;
308 }
309 
310 static void
T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,UErrorCode * err)311 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
312                                                UErrorCode * err)
313 {
314     const UChar *mySource = args->source;
315     unsigned char *myTarget;
316     int32_t *myOffsets;
317     const UChar *sourceLimit = args->sourceLimit;
318     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
319     UChar32 ch, ch2;
320     int32_t offsetNum = 0;
321     unsigned int indexToWrite;
322     unsigned char temp[sizeof(uint32_t)];
323 
324     if(mySource >= sourceLimit) {
325         /* no input, nothing to do */
326         return;
327     }
328 
329     /* write the BOM if necessary */
330     if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
331         static const char bom[]={ 0, 0, (char)0xfe, (char)0xff };
332         ucnv_fromUWriteBytes(args->converter,
333                              bom, 4,
334                              &args->target, args->targetLimit,
335                              &args->offsets, -1,
336                              err);
337         args->converter->fromUnicodeStatus=0;
338     }
339 
340     myTarget = (unsigned char *) args->target;
341     myOffsets = args->offsets;
342     temp[0] = 0;
343 
344     if (args->converter->fromUChar32) {
345         ch = args->converter->fromUChar32;
346         args->converter->fromUChar32 = 0;
347         goto lowsurogate;
348     }
349 
350     while (mySource < sourceLimit && myTarget < targetLimit) {
351         ch = *(mySource++);
352 
353         if (UTF_IS_SURROGATE(ch)) {
354             if (U_IS_LEAD(ch)) {
355 lowsurogate:
356                 if (mySource < sourceLimit) {
357                     ch2 = *mySource;
358                     if (U_IS_TRAIL(ch2)) {
359                         ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
360                         mySource++;
361                     }
362                     else {
363                         /* this is an unmatched trail code unit (2nd surrogate) */
364                         /* callback(illegal) */
365                         args->converter->fromUChar32 = ch;
366                         *err = U_ILLEGAL_CHAR_FOUND;
367                         break;
368                     }
369                 }
370                 else {
371                     /* ran out of source */
372                     args->converter->fromUChar32 = ch;
373                     if (args->flush) {
374                         /* this is an unmatched trail code unit (2nd surrogate) */
375                         /* callback(illegal) */
376                         *err = U_ILLEGAL_CHAR_FOUND;
377                     }
378                     break;
379                 }
380             }
381             else {
382                 /* this is an unmatched trail code unit (2nd surrogate) */
383                 /* callback(illegal) */
384                 args->converter->fromUChar32 = ch;
385                 *err = U_ILLEGAL_CHAR_FOUND;
386                 break;
387             }
388         }
389 
390         /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
391         temp[1] = (uint8_t) (ch >> 16 & 0x1F);
392         temp[2] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
393         temp[3] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
394 
395         for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {
396             if (myTarget < targetLimit) {
397                 *(myTarget++) = temp[indexToWrite];
398                 *(myOffsets++) = offsetNum;
399             }
400             else {
401                 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
402                 *err = U_BUFFER_OVERFLOW_ERROR;
403             }
404         }
405         offsetNum = offsetNum + 1 + (temp[1] != 0);
406     }
407 
408     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
409         *err = U_BUFFER_OVERFLOW_ERROR;
410     }
411 
412     args->target = (char *) myTarget;
413     args->source = mySource;
414     args->offsets = myOffsets;
415 }
416 
417 static UChar32
T_UConverter_getNextUChar_UTF32_BE(UConverterToUnicodeArgs * args,UErrorCode * err)418 T_UConverter_getNextUChar_UTF32_BE(UConverterToUnicodeArgs* args,
419                                    UErrorCode* err)
420 {
421     const uint8_t *mySource;
422     UChar32 myUChar;
423     int32_t length;
424 
425     mySource = (const uint8_t *)args->source;
426     if (mySource >= (const uint8_t *)args->sourceLimit)
427     {
428         /* no input */
429         *err = U_INDEX_OUTOFBOUNDS_ERROR;
430         return 0xffff;
431     }
432 
433     length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
434     if (length < 4)
435     {
436         /* got a partial character */
437         uprv_memcpy(args->converter->toUBytes, mySource, length);
438         args->converter->toULength = (int8_t)length;
439         args->source = (const char *)(mySource + length);
440         *err = U_TRUNCATED_CHAR_FOUND;
441         return 0xffff;
442     }
443 
444     /* Don't even try to do a direct cast because the value may be on an odd address. */
445     myUChar = ((UChar32)mySource[0] << 24)
446             | ((UChar32)mySource[1] << 16)
447             | ((UChar32)mySource[2] << 8)
448             | ((UChar32)mySource[3]);
449 
450     args->source = (const char *)(mySource + 4);
451     if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
452         return myUChar;
453     }
454 
455     uprv_memcpy(args->converter->toUBytes, mySource, 4);
456     args->converter->toULength = 4;
457 
458     *err = U_ILLEGAL_CHAR_FOUND;
459     return 0xffff;
460 }
461 
/*
 * Function table for the UTF-32BE converter. The five T_UConverter_*
 * entries are the conversion routines defined earlier in this file; NULL
 * entries leave their slot unset.
 * NOTE(review): the meaning of each position is fixed by the UConverterImpl
 * declaration, which is not visible here -- confirm the slot order there
 * before adding or moving entries.
 */
462 static const UConverterImpl _UTF32BEImpl = {
463     UCNV_UTF32_BigEndian,
464 
465     NULL,
466     NULL,
467 
468     NULL,
469     NULL,
470     NULL,
471 
472     T_UConverter_toUnicode_UTF32_BE,                /* UTF-32BE bytes -> UTF-16 */
473     T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC,   /* same, also fills args->offsets */
474     T_UConverter_fromUnicode_UTF32_BE,              /* UTF-16 -> UTF-32BE bytes */
475     T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC, /* same, also fills args->offsets */
476     T_UConverter_getNextUChar_UTF32_BE,             /* decode a single code point */
477 
478     NULL,
479     NULL,
480     NULL,
481     NULL,
482     ucnv_getNonSurrogateUnicodeSet
483 };
484 
/*
 * Static description of the UTF-32BE converter: its name, CCSID 1232 and
 * the UCNV_UTF32_BigEndian type tag.
 * NOTE(review): the two 4s after the type tag are presumably the minimum
 * and maximum bytes per character, and { 0, 0, 0xff, 0xfd } with length 4
 * presumably the substitution character U+FFFD in big-endian order --
 * confirm against the UConverterStaticData declaration.
 */
485 /* The 1232 CCSID refers to any version of Unicode with any endianness of UTF-32 */
486 static const UConverterStaticData _UTF32BEStaticData = {
487     sizeof(UConverterStaticData),
488     "UTF-32BE",
489     1232,
490     UCNV_IBM, UCNV_UTF32_BigEndian, 4, 4,
491     { 0, 0, 0xff, 0xfd }, 4, FALSE, FALSE,
492     0,
493     0,
494     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
495 };
496 
/*
 * Shared-data record for the UTF-32BE converter (non-static, so visible to
 * other translation units). It ties together the static description
 * _UTF32BEStaticData and the function table _UTF32BEImpl.
 * NOTE(review): the ~((uint32_t) 0) field looks like a "never released"
 * reference count for a built-in converter -- confirm against the
 * UConverterSharedData declaration.
 */
497 const UConverterSharedData _UTF32BEData = {
498     sizeof(UConverterSharedData), ~((uint32_t) 0),
499     NULL, NULL, &_UTF32BEStaticData, FALSE, &_UTF32BEImpl,
500     0
501 };
502 
503 /* UTF-32LE ---------------------------------------------------------- */
504 
505 static void
T_UConverter_toUnicode_UTF32_LE(UConverterToUnicodeArgs * args,UErrorCode * err)506 T_UConverter_toUnicode_UTF32_LE(UConverterToUnicodeArgs * args,
507                                 UErrorCode * err)
508 {
509     const unsigned char *mySource = (unsigned char *) args->source;
510     UChar *myTarget = args->target;
511     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
512     const UChar *targetLimit = args->targetLimit;
513     unsigned char *toUBytes = args->converter->toUBytes;
514     uint32_t ch, i;
515 
516     /* Restore state of current sequence */
517     if (args->converter->toUnicodeStatus && myTarget < targetLimit)
518     {
519         i = args->converter->toULength;       /* restore # of bytes consumed */
520         args->converter->toULength = 0;
521 
522         /* Stores the previously calculated ch from a previous call*/
523         ch = args->converter->toUnicodeStatus - 1;
524         args->converter->toUnicodeStatus = 0;
525         goto morebytes;
526     }
527 
528     while (mySource < sourceLimit && myTarget < targetLimit)
529     {
530         i = 0;
531         ch = 0;
532 morebytes:
533         while (i < sizeof(uint32_t))
534         {
535             if (mySource < sourceLimit)
536             {
537                 ch |= ((uint8_t)(*mySource)) << (i * 8);
538                 toUBytes[i++] = (char) *(mySource++);
539             }
540             else
541             {
542                 /* stores a partially calculated target*/
543                 /* + 1 to make 0 a valid character */
544                 args->converter->toUnicodeStatus = ch + 1;
545                 args->converter->toULength = (int8_t) i;
546                 goto donefornow;
547             }
548         }
549 
550         if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
551             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
552             if (ch <= MAXIMUM_UCS2) {
553                 /* fits in 16 bits */
554                 *(myTarget++) = (UChar) ch;
555             }
556             else {
557                 /* write out the surrogates */
558                 *(myTarget++) = U16_LEAD(ch);
559                 ch = U16_TRAIL(ch);
560                 if (myTarget < targetLimit) {
561                     *(myTarget++) = (UChar)ch;
562                 }
563                 else {
564                     /* Put in overflow buffer (not handled here) */
565                     args->converter->UCharErrorBuffer[0] = (UChar) ch;
566                     args->converter->UCharErrorBufferLength = 1;
567                     *err = U_BUFFER_OVERFLOW_ERROR;
568                     break;
569                 }
570             }
571         }
572         else {
573             args->converter->toULength = (int8_t)i;
574             *err = U_ILLEGAL_CHAR_FOUND;
575             break;
576         }
577     }
578 
579 donefornow:
580     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
581     {
582         /* End of target buffer */
583         *err = U_BUFFER_OVERFLOW_ERROR;
584     }
585 
586     args->target = myTarget;
587     args->source = (const char *) mySource;
588 }
589 
590 static void
T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,UErrorCode * err)591 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
592                                              UErrorCode * err)
593 {
594     const unsigned char *mySource = (unsigned char *) args->source;
595     UChar *myTarget = args->target;
596     int32_t *myOffsets = args->offsets;
597     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
598     const UChar *targetLimit = args->targetLimit;
599     unsigned char *toUBytes = args->converter->toUBytes;
600     uint32_t ch, i;
601     int32_t offsetNum = 0;
602 
603     /* Restore state of current sequence */
604     if (args->converter->toUnicodeStatus && myTarget < targetLimit)
605     {
606         i = args->converter->toULength;       /* restore # of bytes consumed */
607         args->converter->toULength = 0;
608 
609         /* Stores the previously calculated ch from a previous call*/
610         ch = args->converter->toUnicodeStatus - 1;
611         args->converter->toUnicodeStatus = 0;
612         goto morebytes;
613     }
614 
615     while (mySource < sourceLimit && myTarget < targetLimit)
616     {
617         i = 0;
618         ch = 0;
619 morebytes:
620         while (i < sizeof(uint32_t))
621         {
622             if (mySource < sourceLimit)
623             {
624                 ch |= ((uint8_t)(*mySource)) << (i * 8);
625                 toUBytes[i++] = (char) *(mySource++);
626             }
627             else
628             {
629                 /* stores a partially calculated target*/
630                 /* + 1 to make 0 a valid character */
631                 args->converter->toUnicodeStatus = ch + 1;
632                 args->converter->toULength = (int8_t) i;
633                 goto donefornow;
634             }
635         }
636 
637         if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch))
638         {
639             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
640             if (ch <= MAXIMUM_UCS2)
641             {
642                 /* fits in 16 bits */
643                 *(myTarget++) = (UChar) ch;
644                 *(myOffsets++) = offsetNum;
645             }
646             else {
647                 /* write out the surrogates */
648                 *(myTarget++) = U16_LEAD(ch);
649                 *(myOffsets++) = offsetNum;
650                 ch = U16_TRAIL(ch);
651                 if (myTarget < targetLimit)
652                 {
653                     *(myTarget++) = (UChar)ch;
654                     *(myOffsets++) = offsetNum;
655                 }
656                 else
657                 {
658                     /* Put in overflow buffer (not handled here) */
659                     args->converter->UCharErrorBuffer[0] = (UChar) ch;
660                     args->converter->UCharErrorBufferLength = 1;
661                     *err = U_BUFFER_OVERFLOW_ERROR;
662                     break;
663                 }
664             }
665         }
666         else
667         {
668             args->converter->toULength = (int8_t)i;
669             *err = U_ILLEGAL_CHAR_FOUND;
670             break;
671         }
672         offsetNum += i;
673     }
674 
675 donefornow:
676     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
677     {
678         /* End of target buffer */
679         *err = U_BUFFER_OVERFLOW_ERROR;
680     }
681 
682     args->target = myTarget;
683     args->source = (const char *) mySource;
684     args->offsets = myOffsets;
685 }
686 
687 static void
T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs * args,UErrorCode * err)688 T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs * args,
689                                   UErrorCode * err)
690 {
691     const UChar *mySource = args->source;
692     unsigned char *myTarget;
693     const UChar *sourceLimit = args->sourceLimit;
694     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
695     UChar32 ch, ch2;
696     unsigned int indexToWrite;
697     unsigned char temp[sizeof(uint32_t)];
698 
699     if(mySource >= sourceLimit) {
700         /* no input, nothing to do */
701         return;
702     }
703 
704     /* write the BOM if necessary */
705     if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
706         static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 };
707         ucnv_fromUWriteBytes(args->converter,
708                              bom, 4,
709                              &args->target, args->targetLimit,
710                              &args->offsets, -1,
711                              err);
712         args->converter->fromUnicodeStatus=0;
713     }
714 
715     myTarget = (unsigned char *) args->target;
716     temp[3] = 0;
717 
718     if (args->converter->fromUChar32)
719     {
720         ch = args->converter->fromUChar32;
721         args->converter->fromUChar32 = 0;
722         goto lowsurogate;
723     }
724 
725     while (mySource < sourceLimit && myTarget < targetLimit)
726     {
727         ch = *(mySource++);
728 
729         if (UTF_IS_SURROGATE(ch)) {
730             if (U_IS_LEAD(ch))
731             {
732 lowsurogate:
733                 if (mySource < sourceLimit)
734                 {
735                     ch2 = *mySource;
736                     if (U_IS_TRAIL(ch2)) {
737                         ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
738                         mySource++;
739                     }
740                     else {
741                         /* this is an unmatched trail code unit (2nd surrogate) */
742                         /* callback(illegal) */
743                         args->converter->fromUChar32 = ch;
744                         *err = U_ILLEGAL_CHAR_FOUND;
745                         break;
746                     }
747                 }
748                 else {
749                     /* ran out of source */
750                     args->converter->fromUChar32 = ch;
751                     if (args->flush) {
752                         /* this is an unmatched trail code unit (2nd surrogate) */
753                         /* callback(illegal) */
754                         *err = U_ILLEGAL_CHAR_FOUND;
755                     }
756                     break;
757                 }
758             }
759             else {
760                 /* this is an unmatched trail code unit (2nd surrogate) */
761                 /* callback(illegal) */
762                 args->converter->fromUChar32 = ch;
763                 *err = U_ILLEGAL_CHAR_FOUND;
764                 break;
765             }
766         }
767 
768         /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
769         temp[2] = (uint8_t) (ch >> 16 & 0x1F);
770         temp[1] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
771         temp[0] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
772 
773         for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
774         {
775             if (myTarget < targetLimit)
776             {
777                 *(myTarget++) = temp[indexToWrite];
778             }
779             else
780             {
781                 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
782                 *err = U_BUFFER_OVERFLOW_ERROR;
783             }
784         }
785     }
786 
787     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
788     {
789         *err = U_BUFFER_OVERFLOW_ERROR;
790     }
791 
792     args->target = (char *) myTarget;
793     args->source = mySource;
794 }
795 
796 static void
T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,UErrorCode * err)797 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
798                                                UErrorCode * err)
799 {
800     const UChar *mySource = args->source;
801     unsigned char *myTarget;
802     int32_t *myOffsets;
803     const UChar *sourceLimit = args->sourceLimit;
804     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
805     UChar32 ch, ch2;
806     unsigned int indexToWrite;
807     unsigned char temp[sizeof(uint32_t)];
808     int32_t offsetNum = 0;
809 
810     if(mySource >= sourceLimit) {
811         /* no input, nothing to do */
812         return;
813     }
814 
815     /* write the BOM if necessary */
816     if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
817         static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 };
818         ucnv_fromUWriteBytes(args->converter,
819                              bom, 4,
820                              &args->target, args->targetLimit,
821                              &args->offsets, -1,
822                              err);
823         args->converter->fromUnicodeStatus=0;
824     }
825 
826     myTarget = (unsigned char *) args->target;
827     myOffsets = args->offsets;
828     temp[3] = 0;
829 
830     if (args->converter->fromUChar32)
831     {
832         ch = args->converter->fromUChar32;
833         args->converter->fromUChar32 = 0;
834         goto lowsurogate;
835     }
836 
837     while (mySource < sourceLimit && myTarget < targetLimit)
838     {
839         ch = *(mySource++);
840 
841         if (UTF_IS_SURROGATE(ch)) {
842             if (U_IS_LEAD(ch))
843             {
844 lowsurogate:
845                 if (mySource < sourceLimit)
846                 {
847                     ch2 = *mySource;
848                     if (U_IS_TRAIL(ch2))
849                     {
850                         ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
851                         mySource++;
852                     }
853                     else {
854                         /* this is an unmatched trail code unit (2nd surrogate) */
855                         /* callback(illegal) */
856                         args->converter->fromUChar32 = ch;
857                         *err = U_ILLEGAL_CHAR_FOUND;
858                         break;
859                     }
860                 }
861                 else {
862                     /* ran out of source */
863                     args->converter->fromUChar32 = ch;
864                     if (args->flush) {
865                         /* this is an unmatched trail code unit (2nd surrogate) */
866                         /* callback(illegal) */
867                         *err = U_ILLEGAL_CHAR_FOUND;
868                     }
869                     break;
870                 }
871             }
872             else {
873                 /* this is an unmatched trail code unit (2nd surrogate) */
874                 /* callback(illegal) */
875                 args->converter->fromUChar32 = ch;
876                 *err = U_ILLEGAL_CHAR_FOUND;
877                 break;
878             }
879         }
880 
881         /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
882         temp[2] = (uint8_t) (ch >> 16 & 0x1F);
883         temp[1] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
884         temp[0] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
885 
886         for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
887         {
888             if (myTarget < targetLimit)
889             {
890                 *(myTarget++) = temp[indexToWrite];
891                 *(myOffsets++) = offsetNum;
892             }
893             else
894             {
895                 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
896                 *err = U_BUFFER_OVERFLOW_ERROR;
897             }
898         }
899         offsetNum = offsetNum + 1 + (temp[2] != 0);
900     }
901 
902     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
903     {
904         *err = U_BUFFER_OVERFLOW_ERROR;
905     }
906 
907     args->target = (char *) myTarget;
908     args->source = mySource;
909     args->offsets = myOffsets;
910 }
911 
912 static UChar32
T_UConverter_getNextUChar_UTF32_LE(UConverterToUnicodeArgs * args,UErrorCode * err)913 T_UConverter_getNextUChar_UTF32_LE(UConverterToUnicodeArgs* args,
914                                    UErrorCode* err)
915 {
916     const uint8_t *mySource;
917     UChar32 myUChar;
918     int32_t length;
919 
920     mySource = (const uint8_t *)args->source;
921     if (mySource >= (const uint8_t *)args->sourceLimit)
922     {
923         /* no input */
924         *err = U_INDEX_OUTOFBOUNDS_ERROR;
925         return 0xffff;
926     }
927 
928     length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
929     if (length < 4)
930     {
931         /* got a partial character */
932         uprv_memcpy(args->converter->toUBytes, mySource, length);
933         args->converter->toULength = (int8_t)length;
934         args->source = (const char *)(mySource + length);
935         *err = U_TRUNCATED_CHAR_FOUND;
936         return 0xffff;
937     }
938 
939     /* Don't even try to do a direct cast because the value may be on an odd address. */
940     myUChar = ((UChar32)mySource[3] << 24)
941             | ((UChar32)mySource[2] << 16)
942             | ((UChar32)mySource[1] << 8)
943             | ((UChar32)mySource[0]);
944 
945     args->source = (const char *)(mySource + 4);
946     if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
947         return myUChar;
948     }
949 
950     uprv_memcpy(args->converter->toUBytes, mySource, 4);
951     args->converter->toULength = 4;
952 
953     *err = U_ILLEGAL_CHAR_FOUND;
954     return 0xffff;
955 }
956 
957 static const UConverterImpl _UTF32LEImpl = {
958     UCNV_UTF32_LittleEndian,
959 
960     NULL,
961     NULL,
962 
963     NULL,
964     NULL,
965     NULL,
966 
967     T_UConverter_toUnicode_UTF32_LE,
968     T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC,
969     T_UConverter_fromUnicode_UTF32_LE,
970     T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC,
971     T_UConverter_getNextUChar_UTF32_LE,
972 
973     NULL,
974     NULL,
975     NULL,
976     NULL,
977     ucnv_getNonSurrogateUnicodeSet
978 };
979 
/* The 1234 CCSID refers to any version of Unicode with a little-endian encoding of UTF-32 */
981 static const UConverterStaticData _UTF32LEStaticData = {
982     sizeof(UConverterStaticData),
983     "UTF-32LE",
984     1234,
985     UCNV_IBM, UCNV_UTF32_LittleEndian, 4, 4,
986     { 0xfd, 0xff, 0, 0 }, 4, FALSE, FALSE,
987     0,
988     0,
989     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
990 };
991 
992 
993 const UConverterSharedData _UTF32LEData = {
994     sizeof(UConverterSharedData), ~((uint32_t) 0),
995     NULL, NULL, &_UTF32LEStaticData, FALSE, &_UTF32LEImpl,
996     0
997 };
998 
999 /* UTF-32 (Detect BOM) ------------------------------------------------------ */
1000 
1001 /*
1002  * Detect a BOM at the beginning of the stream and select UTF-32BE or UTF-32LE
1003  * accordingly.
1004  *
1005  * State values:
1006  * 0    initial state
1007  * 1    saw 00
1008  * 2    saw 00 00
1009  * 3    saw 00 00 FE
1010  * 4    -
1011  * 5    saw FF
1012  * 6    saw FF FE
1013  * 7    saw FF FE 00
1014  * 8    UTF-32BE mode
1015  * 9    UTF-32LE mode
1016  *
1017  * During detection: state&3==number of matching bytes so far.
1018  *
1019  * On output, emit U+FEFF as the first code point.
1020  */
1021 
1022 static void
_UTF32Reset(UConverter * cnv,UConverterResetChoice choice)1023 _UTF32Reset(UConverter *cnv, UConverterResetChoice choice) {
1024     if(choice<=UCNV_RESET_TO_UNICODE) {
1025         /* reset toUnicode: state=0 */
1026         cnv->mode=0;
1027     }
1028     if(choice!=UCNV_RESET_TO_UNICODE) {
1029         /* reset fromUnicode: prepare to output the UTF-32PE BOM */
1030         cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
1031     }
1032 }
1033 
1034 static void
_UTF32Open(UConverter * cnv,UConverterLoadArgs * pArgs,UErrorCode * pErrorCode)1035 _UTF32Open(UConverter *cnv,
1036            UConverterLoadArgs *pArgs,
1037            UErrorCode *pErrorCode) {
1038     _UTF32Reset(cnv, UCNV_RESET_BOTH);
1039 }
1040 
/*
 * Both byte order marks back to back, indexed directly by the detection
 * state: entries 0..3 are the big-endian BOM, entries 4..7 the little-endian
 * one.
 */
static const char utf32BOM[8]={
    0, 0, (char)0xfe, (char)0xff,   /* 00 00 FE FF */
    (char)0xff, (char)0xfe, 0, 0    /* FF FE 00 00 */
};
1042 
1043 static void
_UTF32ToUnicodeWithOffsets(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)1044 _UTF32ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
1045                            UErrorCode *pErrorCode) {
1046     UConverter *cnv=pArgs->converter;
1047     const char *source=pArgs->source;
1048     const char *sourceLimit=pArgs->sourceLimit;
1049     int32_t *offsets=pArgs->offsets;
1050 
1051     int32_t state, offsetDelta;
1052     char b;
1053 
1054     state=cnv->mode;
1055 
1056     /*
1057      * If we detect a BOM in this buffer, then we must add the BOM size to the
1058      * offsets because the actual converter function will not see and count the BOM.
1059      * offsetDelta will have the number of the BOM bytes that are in the current buffer.
1060      */
1061     offsetDelta=0;
1062 
1063     while(source<sourceLimit && U_SUCCESS(*pErrorCode)) {
1064         switch(state) {
1065         case 0:
1066             b=*source;
1067             if(b==0) {
1068                 state=1; /* could be 00 00 FE FF */
1069             } else if(b==(char)0xff) {
1070                 state=5; /* could be FF FE 00 00 */
1071             } else {
1072                 state=8; /* default to UTF-32BE */
1073                 continue;
1074             }
1075             ++source;
1076             break;
1077         case 1:
1078         case 2:
1079         case 3:
1080         case 5:
1081         case 6:
1082         case 7:
1083             if(*source==utf32BOM[state]) {
1084                 ++state;
1085                 ++source;
1086                 if(state==4) {
1087                     state=8; /* detect UTF-32BE */
1088                     offsetDelta=(int32_t)(source-pArgs->source);
1089                 } else if(state==8) {
1090                     state=9; /* detect UTF-32LE */
1091                     offsetDelta=(int32_t)(source-pArgs->source);
1092                 }
1093             } else {
1094                 /* switch to UTF-32BE and pass the previous bytes */
1095                 int32_t count=(int32_t)(source-pArgs->source); /* number of bytes from this buffer */
1096 
1097                 /* reset the source */
1098                 source=pArgs->source;
1099 
1100                 if(count==(state&3)) {
1101                     /* simple: all in the same buffer, just reset source */
1102                 } else {
1103                     UBool oldFlush=pArgs->flush;
1104 
1105                     /* some of the bytes are from a previous buffer, replay those first */
1106                     pArgs->source=utf32BOM+(state&4); /* select the correct BOM */
1107                     pArgs->sourceLimit=pArgs->source+((state&3)-count); /* replay previous bytes */
1108                     pArgs->flush=FALSE; /* this sourceLimit is not the real source stream limit */
1109 
1110                     /* no offsets: bytes from previous buffer, and not enough for output */
1111                     T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1112 
1113                     /* restore real pointers; pArgs->source will be set in case 8/9 */
1114                     pArgs->sourceLimit=sourceLimit;
1115                     pArgs->flush=oldFlush;
1116                 }
1117                 state=8;
1118                 continue;
1119             }
1120             break;
1121         case 8:
1122             /* call UTF-32BE */
1123             pArgs->source=source;
1124             if(offsets==NULL) {
1125                 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1126             } else {
1127                 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(pArgs, pErrorCode);
1128             }
1129             source=pArgs->source;
1130             break;
1131         case 9:
1132             /* call UTF-32LE */
1133             pArgs->source=source;
1134             if(offsets==NULL) {
1135                 T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);
1136             } else {
1137                 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(pArgs, pErrorCode);
1138             }
1139             source=pArgs->source;
1140             break;
1141         default:
1142             break; /* does not occur */
1143         }
1144     }
1145 
1146     /* add BOM size to offsets - see comment at offsetDelta declaration */
1147     if(offsets!=NULL && offsetDelta!=0) {
1148         int32_t *offsetsLimit=pArgs->offsets;
1149         while(offsets<offsetsLimit) {
1150             *offsets++ += offsetDelta;
1151         }
1152     }
1153 
1154     pArgs->source=source;
1155 
1156     if(source==sourceLimit && pArgs->flush) {
1157         /* handle truncated input */
1158         switch(state) {
1159         case 0:
1160             break; /* no input at all, nothing to do */
1161         case 8:
1162             T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1163             break;
1164         case 9:
1165             T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);
1166             break;
1167         default:
1168             /* handle 0<state<8: call UTF-32BE with too-short input */
1169             pArgs->source=utf32BOM+(state&4); /* select the correct BOM */
1170             pArgs->sourceLimit=pArgs->source+(state&3); /* replay bytes */
1171 
1172             /* no offsets: not enough for output */
1173             T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1174             pArgs->source=source;
1175             pArgs->sourceLimit=sourceLimit;
1176             state=8;
1177             break;
1178         }
1179     }
1180 
1181     cnv->mode=state;
1182 }
1183 
1184 static UChar32
_UTF32GetNextUChar(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)1185 _UTF32GetNextUChar(UConverterToUnicodeArgs *pArgs,
1186                    UErrorCode *pErrorCode) {
1187     switch(pArgs->converter->mode) {
1188     case 8:
1189         return T_UConverter_getNextUChar_UTF32_BE(pArgs, pErrorCode);
1190     case 9:
1191         return T_UConverter_getNextUChar_UTF32_LE(pArgs, pErrorCode);
1192     default:
1193         return UCNV_GET_NEXT_UCHAR_USE_TO_U;
1194     }
1195 }
1196 
1197 static const UConverterImpl _UTF32Impl = {
1198     UCNV_UTF32,
1199 
1200     NULL,
1201     NULL,
1202 
1203     _UTF32Open,
1204     NULL,
1205     _UTF32Reset,
1206 
1207     _UTF32ToUnicodeWithOffsets,
1208     _UTF32ToUnicodeWithOffsets,
1209 #if U_IS_BIG_ENDIAN
1210     T_UConverter_fromUnicode_UTF32_BE,
1211     T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC,
1212 #else
1213     T_UConverter_fromUnicode_UTF32_LE,
1214     T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC,
1215 #endif
1216     _UTF32GetNextUChar,
1217 
1218     NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
1219     NULL,
1220     NULL,
1221     NULL,
1222     ucnv_getNonSurrogateUnicodeSet
1223 };
1224 
/* The 1236 CCSID refers to any version of Unicode with a BOM-sensitive endianness of UTF-32 */
1226 static const UConverterStaticData _UTF32StaticData = {
1227     sizeof(UConverterStaticData),
1228     "UTF-32",
1229     1236,
1230     UCNV_IBM, UCNV_UTF32, 4, 4,
1231 #if U_IS_BIG_ENDIAN
1232     { 0, 0, 0xff, 0xfd }, 4,
1233 #else
1234     { 0xfd, 0xff, 0, 0 }, 4,
1235 #endif
1236     FALSE, FALSE,
1237     0,
1238     0,
1239     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1240 };
1241 
1242 const UConverterSharedData _UTF32Data = {
1243     sizeof(UConverterSharedData), ~((uint32_t) 0),
1244     NULL, NULL, &_UTF32StaticData, FALSE, &_UTF32Impl,
1245     0
1246 };
1247 
1248 #endif
1249