• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 **********************************************************************
3 *   Copyright (C) 2002-2015, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 **********************************************************************
6 *   file name:  ucnv_u32.c
7 *   encoding:   US-ASCII
8 *   tab size:   8 (not used)
9 *   indentation:4
10 *
11 *   created on: 2002jul01
12 *   created by: Markus W. Scherer
13 *
14 *   UTF-32 converter implementation. Used to be in ucnv_utf.c.
15 */
16 
17 #include "unicode/utypes.h"
18 
19 #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
20 
21 #include "unicode/ucnv.h"
22 #include "unicode/utf.h"
23 #include "ucnv_bld.h"
24 #include "ucnv_cnv.h"
25 #include "cmemory.h"
26 
/* Code point limits: values up to MAXIMUM_UCS2 fit in one 16-bit unit;
 * values above MAXIMUM_UTF are rejected by the converters in this file. */
#define MAXIMUM_UCS2            0xFFFF
#define MAXIMUM_UTF             0x10FFFF

/* Constants for combining/splitting UTF-16 surrogate pairs. */
#define HALF_SHIFT              10
#define HALF_BASE               0x10000
#define HALF_MASK               0x3FF
#define SURROGATE_HIGH_START    0xD800
#define SURROGATE_LOW_START     0xDC00

/*
 * Additive constant used when composing a code point from a pair:
 *   ((lead - SURROGATE_HIGH_START) << HALF_SHIFT) + trail + SURROGATE_LOW_BASE
 * It equals -SURROGATE_LOW_START + HALF_BASE (9216).
 */
#define SURROGATE_LOW_BASE      (HALF_BASE - SURROGATE_LOW_START)

/* Value of fromUnicodeStatus that requests a byte order mark before output. */
enum {
    UCNV_NEED_TO_WRITE_BOM = 1
};
41 
42 /* UTF-32BE ----------------------------------------------------------------- */
43 
44 static void
T_UConverter_toUnicode_UTF32_BE(UConverterToUnicodeArgs * args,UErrorCode * err)45 T_UConverter_toUnicode_UTF32_BE(UConverterToUnicodeArgs * args,
46                                 UErrorCode * err)
47 {
48     const unsigned char *mySource = (unsigned char *) args->source;
49     UChar *myTarget = args->target;
50     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
51     const UChar *targetLimit = args->targetLimit;
52     unsigned char *toUBytes = args->converter->toUBytes;
53     uint32_t ch, i;
54 
55     /* Restore state of current sequence */
56     if (args->converter->toUnicodeStatus && myTarget < targetLimit) {
57         i = args->converter->toULength;       /* restore # of bytes consumed */
58         args->converter->toULength = 0;
59 
60         ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
61         args->converter->toUnicodeStatus = 0;
62         goto morebytes;
63     }
64 
65     while (mySource < sourceLimit && myTarget < targetLimit) {
66         i = 0;
67         ch = 0;
68 morebytes:
69         while (i < sizeof(uint32_t)) {
70             if (mySource < sourceLimit) {
71                 ch = (ch << 8) | (uint8_t)(*mySource);
72                 toUBytes[i++] = (char) *(mySource++);
73             }
74             else {
75                 /* stores a partially calculated target*/
76                 /* + 1 to make 0 a valid character */
77                 args->converter->toUnicodeStatus = ch + 1;
78                 args->converter->toULength = (int8_t) i;
79                 goto donefornow;
80             }
81         }
82 
83         if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
84             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
85             if (ch <= MAXIMUM_UCS2)
86             {
87                 /* fits in 16 bits */
88                 *(myTarget++) = (UChar) ch;
89             }
90             else {
91                 /* write out the surrogates */
92                 *(myTarget++) = U16_LEAD(ch);
93                 ch = U16_TRAIL(ch);
94                 if (myTarget < targetLimit) {
95                     *(myTarget++) = (UChar)ch;
96                 }
97                 else {
98                     /* Put in overflow buffer (not handled here) */
99                     args->converter->UCharErrorBuffer[0] = (UChar) ch;
100                     args->converter->UCharErrorBufferLength = 1;
101                     *err = U_BUFFER_OVERFLOW_ERROR;
102                     break;
103                 }
104             }
105         }
106         else {
107             args->converter->toULength = (int8_t)i;
108             *err = U_ILLEGAL_CHAR_FOUND;
109             break;
110         }
111     }
112 
113 donefornow:
114     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
115         /* End of target buffer */
116         *err = U_BUFFER_OVERFLOW_ERROR;
117     }
118 
119     args->target = myTarget;
120     args->source = (const char *) mySource;
121 }
122 
123 static void
T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,UErrorCode * err)124 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
125                                              UErrorCode * err)
126 {
127     const unsigned char *mySource = (unsigned char *) args->source;
128     UChar *myTarget = args->target;
129     int32_t *myOffsets = args->offsets;
130     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
131     const UChar *targetLimit = args->targetLimit;
132     unsigned char *toUBytes = args->converter->toUBytes;
133     uint32_t ch, i;
134     int32_t offsetNum = 0;
135 
136     /* Restore state of current sequence */
137     if (args->converter->toUnicodeStatus && myTarget < targetLimit) {
138         i = args->converter->toULength;       /* restore # of bytes consumed */
139         args->converter->toULength = 0;
140 
141         ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
142         args->converter->toUnicodeStatus = 0;
143         goto morebytes;
144     }
145 
146     while (mySource < sourceLimit && myTarget < targetLimit) {
147         i = 0;
148         ch = 0;
149 morebytes:
150         while (i < sizeof(uint32_t)) {
151             if (mySource < sourceLimit) {
152                 ch = (ch << 8) | (uint8_t)(*mySource);
153                 toUBytes[i++] = (char) *(mySource++);
154             }
155             else {
156                 /* stores a partially calculated target*/
157                 /* + 1 to make 0 a valid character */
158                 args->converter->toUnicodeStatus = ch + 1;
159                 args->converter->toULength = (int8_t) i;
160                 goto donefornow;
161             }
162         }
163 
164         if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
165             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
166             if (ch <= MAXIMUM_UCS2) {
167                 /* fits in 16 bits */
168                 *(myTarget++) = (UChar) ch;
169                 *(myOffsets++) = offsetNum;
170             }
171             else {
172                 /* write out the surrogates */
173                 *(myTarget++) = U16_LEAD(ch);
174                 *myOffsets++ = offsetNum;
175                 ch = U16_TRAIL(ch);
176                 if (myTarget < targetLimit)
177                 {
178                     *(myTarget++) = (UChar)ch;
179                     *(myOffsets++) = offsetNum;
180                 }
181                 else {
182                     /* Put in overflow buffer (not handled here) */
183                     args->converter->UCharErrorBuffer[0] = (UChar) ch;
184                     args->converter->UCharErrorBufferLength = 1;
185                     *err = U_BUFFER_OVERFLOW_ERROR;
186                     break;
187                 }
188             }
189         }
190         else {
191             args->converter->toULength = (int8_t)i;
192             *err = U_ILLEGAL_CHAR_FOUND;
193             break;
194         }
195         offsetNum += i;
196     }
197 
198 donefornow:
199     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
200     {
201         /* End of target buffer */
202         *err = U_BUFFER_OVERFLOW_ERROR;
203     }
204 
205     args->target = myTarget;
206     args->source = (const char *) mySource;
207     args->offsets = myOffsets;
208 }
209 
210 static void
T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs * args,UErrorCode * err)211 T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs * args,
212                                   UErrorCode * err)
213 {
214     const UChar *mySource = args->source;
215     unsigned char *myTarget;
216     const UChar *sourceLimit = args->sourceLimit;
217     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
218     UChar32 ch, ch2;
219     unsigned int indexToWrite;
220     unsigned char temp[sizeof(uint32_t)];
221 
222     if(mySource >= sourceLimit) {
223         /* no input, nothing to do */
224         return;
225     }
226 
227     /* write the BOM if necessary */
228     if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
229         static const char bom[]={ 0, 0, (char)0xfe, (char)0xff };
230         ucnv_fromUWriteBytes(args->converter,
231                              bom, 4,
232                              &args->target, args->targetLimit,
233                              &args->offsets, -1,
234                              err);
235         args->converter->fromUnicodeStatus=0;
236     }
237 
238     myTarget = (unsigned char *) args->target;
239     temp[0] = 0;
240 
241     if (args->converter->fromUChar32) {
242         ch = args->converter->fromUChar32;
243         args->converter->fromUChar32 = 0;
244         goto lowsurogate;
245     }
246 
247     while (mySource < sourceLimit && myTarget < targetLimit) {
248         ch = *(mySource++);
249 
250         if (U_IS_SURROGATE(ch)) {
251             if (U_IS_LEAD(ch)) {
252 lowsurogate:
253                 if (mySource < sourceLimit) {
254                     ch2 = *mySource;
255                     if (U_IS_TRAIL(ch2)) {
256                         ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
257                         mySource++;
258                     }
259                     else {
260                         /* this is an unmatched trail code unit (2nd surrogate) */
261                         /* callback(illegal) */
262                         args->converter->fromUChar32 = ch;
263                         *err = U_ILLEGAL_CHAR_FOUND;
264                         break;
265                     }
266                 }
267                 else {
268                     /* ran out of source */
269                     args->converter->fromUChar32 = ch;
270                     if (args->flush) {
271                         /* this is an unmatched trail code unit (2nd surrogate) */
272                         /* callback(illegal) */
273                         *err = U_ILLEGAL_CHAR_FOUND;
274                     }
275                     break;
276                 }
277             }
278             else {
279                 /* this is an unmatched trail code unit (2nd surrogate) */
280                 /* callback(illegal) */
281                 args->converter->fromUChar32 = ch;
282                 *err = U_ILLEGAL_CHAR_FOUND;
283                 break;
284             }
285         }
286 
287         /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
288         temp[1] = (uint8_t) (ch >> 16 & 0x1F);
289         temp[2] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
290         temp[3] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
291 
292         for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {
293             if (myTarget < targetLimit) {
294                 *(myTarget++) = temp[indexToWrite];
295             }
296             else {
297                 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
298                 *err = U_BUFFER_OVERFLOW_ERROR;
299             }
300         }
301     }
302 
303     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
304         *err = U_BUFFER_OVERFLOW_ERROR;
305     }
306 
307     args->target = (char *) myTarget;
308     args->source = mySource;
309 }
310 
311 static void
T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,UErrorCode * err)312 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
313                                                UErrorCode * err)
314 {
315     const UChar *mySource = args->source;
316     unsigned char *myTarget;
317     int32_t *myOffsets;
318     const UChar *sourceLimit = args->sourceLimit;
319     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
320     UChar32 ch, ch2;
321     int32_t offsetNum = 0;
322     unsigned int indexToWrite;
323     unsigned char temp[sizeof(uint32_t)];
324 
325     if(mySource >= sourceLimit) {
326         /* no input, nothing to do */
327         return;
328     }
329 
330     /* write the BOM if necessary */
331     if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
332         static const char bom[]={ 0, 0, (char)0xfe, (char)0xff };
333         ucnv_fromUWriteBytes(args->converter,
334                              bom, 4,
335                              &args->target, args->targetLimit,
336                              &args->offsets, -1,
337                              err);
338         args->converter->fromUnicodeStatus=0;
339     }
340 
341     myTarget = (unsigned char *) args->target;
342     myOffsets = args->offsets;
343     temp[0] = 0;
344 
345     if (args->converter->fromUChar32) {
346         ch = args->converter->fromUChar32;
347         args->converter->fromUChar32 = 0;
348         goto lowsurogate;
349     }
350 
351     while (mySource < sourceLimit && myTarget < targetLimit) {
352         ch = *(mySource++);
353 
354         if (U_IS_SURROGATE(ch)) {
355             if (U_IS_LEAD(ch)) {
356 lowsurogate:
357                 if (mySource < sourceLimit) {
358                     ch2 = *mySource;
359                     if (U_IS_TRAIL(ch2)) {
360                         ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
361                         mySource++;
362                     }
363                     else {
364                         /* this is an unmatched trail code unit (2nd surrogate) */
365                         /* callback(illegal) */
366                         args->converter->fromUChar32 = ch;
367                         *err = U_ILLEGAL_CHAR_FOUND;
368                         break;
369                     }
370                 }
371                 else {
372                     /* ran out of source */
373                     args->converter->fromUChar32 = ch;
374                     if (args->flush) {
375                         /* this is an unmatched trail code unit (2nd surrogate) */
376                         /* callback(illegal) */
377                         *err = U_ILLEGAL_CHAR_FOUND;
378                     }
379                     break;
380                 }
381             }
382             else {
383                 /* this is an unmatched trail code unit (2nd surrogate) */
384                 /* callback(illegal) */
385                 args->converter->fromUChar32 = ch;
386                 *err = U_ILLEGAL_CHAR_FOUND;
387                 break;
388             }
389         }
390 
391         /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
392         temp[1] = (uint8_t) (ch >> 16 & 0x1F);
393         temp[2] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
394         temp[3] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
395 
396         for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {
397             if (myTarget < targetLimit) {
398                 *(myTarget++) = temp[indexToWrite];
399                 *(myOffsets++) = offsetNum;
400             }
401             else {
402                 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
403                 *err = U_BUFFER_OVERFLOW_ERROR;
404             }
405         }
406         offsetNum = offsetNum + 1 + (temp[1] != 0);
407     }
408 
409     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
410         *err = U_BUFFER_OVERFLOW_ERROR;
411     }
412 
413     args->target = (char *) myTarget;
414     args->source = mySource;
415     args->offsets = myOffsets;
416 }
417 
418 static UChar32
T_UConverter_getNextUChar_UTF32_BE(UConverterToUnicodeArgs * args,UErrorCode * err)419 T_UConverter_getNextUChar_UTF32_BE(UConverterToUnicodeArgs* args,
420                                    UErrorCode* err)
421 {
422     const uint8_t *mySource;
423     UChar32 myUChar;
424     int32_t length;
425 
426     mySource = (const uint8_t *)args->source;
427     if (mySource >= (const uint8_t *)args->sourceLimit)
428     {
429         /* no input */
430         *err = U_INDEX_OUTOFBOUNDS_ERROR;
431         return 0xffff;
432     }
433 
434     length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
435     if (length < 4)
436     {
437         /* got a partial character */
438         uprv_memcpy(args->converter->toUBytes, mySource, length);
439         args->converter->toULength = (int8_t)length;
440         args->source = (const char *)(mySource + length);
441         *err = U_TRUNCATED_CHAR_FOUND;
442         return 0xffff;
443     }
444 
445     /* Don't even try to do a direct cast because the value may be on an odd address. */
446     myUChar = ((UChar32)mySource[0] << 24)
447             | ((UChar32)mySource[1] << 16)
448             | ((UChar32)mySource[2] << 8)
449             | ((UChar32)mySource[3]);
450 
451     args->source = (const char *)(mySource + 4);
452     if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
453         return myUChar;
454     }
455 
456     uprv_memcpy(args->converter->toUBytes, mySource, 4);
457     args->converter->toULength = 4;
458 
459     *err = U_ILLEGAL_CHAR_FOUND;
460     return 0xffff;
461 }
462 
463 static const UConverterImpl _UTF32BEImpl = {
464     UCNV_UTF32_BigEndian,
465 
466     NULL,
467     NULL,
468 
469     NULL,
470     NULL,
471     NULL,
472 
473     T_UConverter_toUnicode_UTF32_BE,
474     T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC,
475     T_UConverter_fromUnicode_UTF32_BE,
476     T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC,
477     T_UConverter_getNextUChar_UTF32_BE,
478 
479     NULL,
480     NULL,
481     NULL,
482     NULL,
483     ucnv_getNonSurrogateUnicodeSet
484 };
485 
486 /* The 1232 CCSID refers to any version of Unicode with any endianess of UTF-32 */
487 static const UConverterStaticData _UTF32BEStaticData = {
488     sizeof(UConverterStaticData),
489     "UTF-32BE",
490     1232,
491     UCNV_IBM, UCNV_UTF32_BigEndian, 4, 4,
492     { 0, 0, 0xff, 0xfd }, 4, FALSE, FALSE,
493     0,
494     0,
495     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
496 };
497 
498 const UConverterSharedData _UTF32BEData =
499         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF32BEStaticData, &_UTF32BEImpl);
500 
501 /* UTF-32LE ---------------------------------------------------------- */
502 
503 static void
T_UConverter_toUnicode_UTF32_LE(UConverterToUnicodeArgs * args,UErrorCode * err)504 T_UConverter_toUnicode_UTF32_LE(UConverterToUnicodeArgs * args,
505                                 UErrorCode * err)
506 {
507     const unsigned char *mySource = (unsigned char *) args->source;
508     UChar *myTarget = args->target;
509     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
510     const UChar *targetLimit = args->targetLimit;
511     unsigned char *toUBytes = args->converter->toUBytes;
512     uint32_t ch, i;
513 
514     /* Restore state of current sequence */
515     if (args->converter->toUnicodeStatus && myTarget < targetLimit)
516     {
517         i = args->converter->toULength;       /* restore # of bytes consumed */
518         args->converter->toULength = 0;
519 
520         /* Stores the previously calculated ch from a previous call*/
521         ch = args->converter->toUnicodeStatus - 1;
522         args->converter->toUnicodeStatus = 0;
523         goto morebytes;
524     }
525 
526     while (mySource < sourceLimit && myTarget < targetLimit)
527     {
528         i = 0;
529         ch = 0;
530 morebytes:
531         while (i < sizeof(uint32_t))
532         {
533             if (mySource < sourceLimit)
534             {
535                 ch |= ((uint8_t)(*mySource)) << (i * 8);
536                 toUBytes[i++] = (char) *(mySource++);
537             }
538             else
539             {
540                 /* stores a partially calculated target*/
541                 /* + 1 to make 0 a valid character */
542                 args->converter->toUnicodeStatus = ch + 1;
543                 args->converter->toULength = (int8_t) i;
544                 goto donefornow;
545             }
546         }
547 
548         if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
549             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
550             if (ch <= MAXIMUM_UCS2) {
551                 /* fits in 16 bits */
552                 *(myTarget++) = (UChar) ch;
553             }
554             else {
555                 /* write out the surrogates */
556                 *(myTarget++) = U16_LEAD(ch);
557                 ch = U16_TRAIL(ch);
558                 if (myTarget < targetLimit) {
559                     *(myTarget++) = (UChar)ch;
560                 }
561                 else {
562                     /* Put in overflow buffer (not handled here) */
563                     args->converter->UCharErrorBuffer[0] = (UChar) ch;
564                     args->converter->UCharErrorBufferLength = 1;
565                     *err = U_BUFFER_OVERFLOW_ERROR;
566                     break;
567                 }
568             }
569         }
570         else {
571             args->converter->toULength = (int8_t)i;
572             *err = U_ILLEGAL_CHAR_FOUND;
573             break;
574         }
575     }
576 
577 donefornow:
578     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
579     {
580         /* End of target buffer */
581         *err = U_BUFFER_OVERFLOW_ERROR;
582     }
583 
584     args->target = myTarget;
585     args->source = (const char *) mySource;
586 }
587 
588 static void
T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,UErrorCode * err)589 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
590                                              UErrorCode * err)
591 {
592     const unsigned char *mySource = (unsigned char *) args->source;
593     UChar *myTarget = args->target;
594     int32_t *myOffsets = args->offsets;
595     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
596     const UChar *targetLimit = args->targetLimit;
597     unsigned char *toUBytes = args->converter->toUBytes;
598     uint32_t ch, i;
599     int32_t offsetNum = 0;
600 
601     /* Restore state of current sequence */
602     if (args->converter->toUnicodeStatus && myTarget < targetLimit)
603     {
604         i = args->converter->toULength;       /* restore # of bytes consumed */
605         args->converter->toULength = 0;
606 
607         /* Stores the previously calculated ch from a previous call*/
608         ch = args->converter->toUnicodeStatus - 1;
609         args->converter->toUnicodeStatus = 0;
610         goto morebytes;
611     }
612 
613     while (mySource < sourceLimit && myTarget < targetLimit)
614     {
615         i = 0;
616         ch = 0;
617 morebytes:
618         while (i < sizeof(uint32_t))
619         {
620             if (mySource < sourceLimit)
621             {
622                 ch |= ((uint8_t)(*mySource)) << (i * 8);
623                 toUBytes[i++] = (char) *(mySource++);
624             }
625             else
626             {
627                 /* stores a partially calculated target*/
628                 /* + 1 to make 0 a valid character */
629                 args->converter->toUnicodeStatus = ch + 1;
630                 args->converter->toULength = (int8_t) i;
631                 goto donefornow;
632             }
633         }
634 
635         if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch))
636         {
637             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
638             if (ch <= MAXIMUM_UCS2)
639             {
640                 /* fits in 16 bits */
641                 *(myTarget++) = (UChar) ch;
642                 *(myOffsets++) = offsetNum;
643             }
644             else {
645                 /* write out the surrogates */
646                 *(myTarget++) = U16_LEAD(ch);
647                 *(myOffsets++) = offsetNum;
648                 ch = U16_TRAIL(ch);
649                 if (myTarget < targetLimit)
650                 {
651                     *(myTarget++) = (UChar)ch;
652                     *(myOffsets++) = offsetNum;
653                 }
654                 else
655                 {
656                     /* Put in overflow buffer (not handled here) */
657                     args->converter->UCharErrorBuffer[0] = (UChar) ch;
658                     args->converter->UCharErrorBufferLength = 1;
659                     *err = U_BUFFER_OVERFLOW_ERROR;
660                     break;
661                 }
662             }
663         }
664         else
665         {
666             args->converter->toULength = (int8_t)i;
667             *err = U_ILLEGAL_CHAR_FOUND;
668             break;
669         }
670         offsetNum += i;
671     }
672 
673 donefornow:
674     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
675     {
676         /* End of target buffer */
677         *err = U_BUFFER_OVERFLOW_ERROR;
678     }
679 
680     args->target = myTarget;
681     args->source = (const char *) mySource;
682     args->offsets = myOffsets;
683 }
684 
685 static void
T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs * args,UErrorCode * err)686 T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs * args,
687                                   UErrorCode * err)
688 {
689     const UChar *mySource = args->source;
690     unsigned char *myTarget;
691     const UChar *sourceLimit = args->sourceLimit;
692     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
693     UChar32 ch, ch2;
694     unsigned int indexToWrite;
695     unsigned char temp[sizeof(uint32_t)];
696 
697     if(mySource >= sourceLimit) {
698         /* no input, nothing to do */
699         return;
700     }
701 
702     /* write the BOM if necessary */
703     if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
704         static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 };
705         ucnv_fromUWriteBytes(args->converter,
706                              bom, 4,
707                              &args->target, args->targetLimit,
708                              &args->offsets, -1,
709                              err);
710         args->converter->fromUnicodeStatus=0;
711     }
712 
713     myTarget = (unsigned char *) args->target;
714     temp[3] = 0;
715 
716     if (args->converter->fromUChar32)
717     {
718         ch = args->converter->fromUChar32;
719         args->converter->fromUChar32 = 0;
720         goto lowsurogate;
721     }
722 
723     while (mySource < sourceLimit && myTarget < targetLimit)
724     {
725         ch = *(mySource++);
726 
727         if (U16_IS_SURROGATE(ch)) {
728             if (U16_IS_LEAD(ch))
729             {
730 lowsurogate:
731                 if (mySource < sourceLimit)
732                 {
733                     ch2 = *mySource;
734                     if (U16_IS_TRAIL(ch2)) {
735                         ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
736                         mySource++;
737                     }
738                     else {
739                         /* this is an unmatched trail code unit (2nd surrogate) */
740                         /* callback(illegal) */
741                         args->converter->fromUChar32 = ch;
742                         *err = U_ILLEGAL_CHAR_FOUND;
743                         break;
744                     }
745                 }
746                 else {
747                     /* ran out of source */
748                     args->converter->fromUChar32 = ch;
749                     if (args->flush) {
750                         /* this is an unmatched trail code unit (2nd surrogate) */
751                         /* callback(illegal) */
752                         *err = U_ILLEGAL_CHAR_FOUND;
753                     }
754                     break;
755                 }
756             }
757             else {
758                 /* this is an unmatched trail code unit (2nd surrogate) */
759                 /* callback(illegal) */
760                 args->converter->fromUChar32 = ch;
761                 *err = U_ILLEGAL_CHAR_FOUND;
762                 break;
763             }
764         }
765 
766         /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
767         temp[2] = (uint8_t) (ch >> 16 & 0x1F);
768         temp[1] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
769         temp[0] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
770 
771         for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
772         {
773             if (myTarget < targetLimit)
774             {
775                 *(myTarget++) = temp[indexToWrite];
776             }
777             else
778             {
779                 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
780                 *err = U_BUFFER_OVERFLOW_ERROR;
781             }
782         }
783     }
784 
785     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
786     {
787         *err = U_BUFFER_OVERFLOW_ERROR;
788     }
789 
790     args->target = (char *) myTarget;
791     args->source = mySource;
792 }
793 
794 static void
T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,UErrorCode * err)795 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
796                                                UErrorCode * err)
797 {
798     const UChar *mySource = args->source;
799     unsigned char *myTarget;
800     int32_t *myOffsets;
801     const UChar *sourceLimit = args->sourceLimit;
802     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
803     UChar32 ch, ch2;
804     unsigned int indexToWrite;
805     unsigned char temp[sizeof(uint32_t)];
806     int32_t offsetNum = 0;
807 
808     if(mySource >= sourceLimit) {
809         /* no input, nothing to do */
810         return;
811     }
812 
813     /* write the BOM if necessary */
814     if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
815         static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 };
816         ucnv_fromUWriteBytes(args->converter,
817                              bom, 4,
818                              &args->target, args->targetLimit,
819                              &args->offsets, -1,
820                              err);
821         args->converter->fromUnicodeStatus=0;
822     }
823 
824     myTarget = (unsigned char *) args->target;
825     myOffsets = args->offsets;
826     temp[3] = 0;
827 
828     if (args->converter->fromUChar32)
829     {
830         ch = args->converter->fromUChar32;
831         args->converter->fromUChar32 = 0;
832         goto lowsurogate;
833     }
834 
835     while (mySource < sourceLimit && myTarget < targetLimit)
836     {
837         ch = *(mySource++);
838 
839         if (U16_IS_SURROGATE(ch)) {
840             if (U16_IS_LEAD(ch))
841             {
842 lowsurogate:
843                 if (mySource < sourceLimit)
844                 {
845                     ch2 = *mySource;
846                     if (U16_IS_TRAIL(ch2))
847                     {
848                         ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
849                         mySource++;
850                     }
851                     else {
852                         /* this is an unmatched trail code unit (2nd surrogate) */
853                         /* callback(illegal) */
854                         args->converter->fromUChar32 = ch;
855                         *err = U_ILLEGAL_CHAR_FOUND;
856                         break;
857                     }
858                 }
859                 else {
860                     /* ran out of source */
861                     args->converter->fromUChar32 = ch;
862                     if (args->flush) {
863                         /* this is an unmatched trail code unit (2nd surrogate) */
864                         /* callback(illegal) */
865                         *err = U_ILLEGAL_CHAR_FOUND;
866                     }
867                     break;
868                 }
869             }
870             else {
871                 /* this is an unmatched trail code unit (2nd surrogate) */
872                 /* callback(illegal) */
873                 args->converter->fromUChar32 = ch;
874                 *err = U_ILLEGAL_CHAR_FOUND;
875                 break;
876             }
877         }
878 
879         /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
880         temp[2] = (uint8_t) (ch >> 16 & 0x1F);
881         temp[1] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
882         temp[0] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
883 
884         for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
885         {
886             if (myTarget < targetLimit)
887             {
888                 *(myTarget++) = temp[indexToWrite];
889                 *(myOffsets++) = offsetNum;
890             }
891             else
892             {
893                 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
894                 *err = U_BUFFER_OVERFLOW_ERROR;
895             }
896         }
897         offsetNum = offsetNum + 1 + (temp[2] != 0);
898     }
899 
900     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
901     {
902         *err = U_BUFFER_OVERFLOW_ERROR;
903     }
904 
905     args->target = (char *) myTarget;
906     args->source = mySource;
907     args->offsets = myOffsets;
908 }
909 
910 static UChar32
T_UConverter_getNextUChar_UTF32_LE(UConverterToUnicodeArgs * args,UErrorCode * err)911 T_UConverter_getNextUChar_UTF32_LE(UConverterToUnicodeArgs* args,
912                                    UErrorCode* err)
913 {
914     const uint8_t *mySource;
915     UChar32 myUChar;
916     int32_t length;
917 
918     mySource = (const uint8_t *)args->source;
919     if (mySource >= (const uint8_t *)args->sourceLimit)
920     {
921         /* no input */
922         *err = U_INDEX_OUTOFBOUNDS_ERROR;
923         return 0xffff;
924     }
925 
926     length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
927     if (length < 4)
928     {
929         /* got a partial character */
930         uprv_memcpy(args->converter->toUBytes, mySource, length);
931         args->converter->toULength = (int8_t)length;
932         args->source = (const char *)(mySource + length);
933         *err = U_TRUNCATED_CHAR_FOUND;
934         return 0xffff;
935     }
936 
937     /* Don't even try to do a direct cast because the value may be on an odd address. */
938     myUChar = ((UChar32)mySource[3] << 24)
939             | ((UChar32)mySource[2] << 16)
940             | ((UChar32)mySource[1] << 8)
941             | ((UChar32)mySource[0]);
942 
943     args->source = (const char *)(mySource + 4);
944     if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
945         return myUChar;
946     }
947 
948     uprv_memcpy(args->converter->toUBytes, mySource, 4);
949     args->converter->toULength = 4;
950 
951     *err = U_ILLEGAL_CHAR_FOUND;
952     return 0xffff;
953 }
954 
955 static const UConverterImpl _UTF32LEImpl = {
956     UCNV_UTF32_LittleEndian,
957 
958     NULL,
959     NULL,
960 
961     NULL,
962     NULL,
963     NULL,
964 
965     T_UConverter_toUnicode_UTF32_LE,
966     T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC,
967     T_UConverter_fromUnicode_UTF32_LE,
968     T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC,
969     T_UConverter_getNextUChar_UTF32_LE,
970 
971     NULL,
972     NULL,
973     NULL,
974     NULL,
975     ucnv_getNonSurrogateUnicodeSet
976 };
977 
/* The 1234 CCSID refers to any version of Unicode in little-endian UTF-32 */
979 static const UConverterStaticData _UTF32LEStaticData = {
980     sizeof(UConverterStaticData),
981     "UTF-32LE",
982     1234,
983     UCNV_IBM, UCNV_UTF32_LittleEndian, 4, 4,
984     { 0xfd, 0xff, 0, 0 }, 4, FALSE, FALSE,
985     0,
986     0,
987     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
988 };
989 
990 
991 const UConverterSharedData _UTF32LEData =
992         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF32LEStaticData, &_UTF32LEImpl);
993 
994 /* UTF-32 (Detect BOM) ------------------------------------------------------ */
995 
/*
 * Detect a BOM at the beginning of the stream and select UTF-32BE or UTF-32LE
 * accordingly. If the stream does not start with a BOM, UTF-32BE is assumed.
 *
 * State values:
 * 0    initial state
 * 1    saw 00
 * 2    saw 00 00
 * 3    saw 00 00 FE
 * 4    - (unused; a complete 00 00 FE FF goes straight to state 8)
 * 5    saw FF
 * 6    saw FF FE
 * 7    saw FF FE 00
 * 8    UTF-32BE mode
 * 9    UTF-32LE mode
 *
 * During detection: state&3==number of matching bytes so far,
 * and state&4 selects the BOM being matched (0: 00 00 FE FF, 4: FF FE 00 00).
 *
 * On output, emit U+FEFF as the first code point.
 */
1016 
1017 static void
_UTF32Reset(UConverter * cnv,UConverterResetChoice choice)1018 _UTF32Reset(UConverter *cnv, UConverterResetChoice choice) {
1019     if(choice<=UCNV_RESET_TO_UNICODE) {
1020         /* reset toUnicode: state=0 */
1021         cnv->mode=0;
1022     }
1023     if(choice!=UCNV_RESET_TO_UNICODE) {
1024         /* reset fromUnicode: prepare to output the UTF-32PE BOM */
1025         cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
1026     }
1027 }
1028 
1029 static void
_UTF32Open(UConverter * cnv,UConverterLoadArgs * pArgs,UErrorCode * pErrorCode)1030 _UTF32Open(UConverter *cnv,
1031            UConverterLoadArgs *pArgs,
1032            UErrorCode *pErrorCode) {
1033     _UTF32Reset(cnv, UCNV_RESET_BOTH);
1034 }
1035 
/*
 * Both UTF-32 BOMs back to back; the BOM detection indexes this array with
 * its state value and uses (state&4) as the offset of the BOM being matched.
 */
static const char utf32BOM[8]={
    0, 0, (char)0xfe, (char)0xff,   /* offset 0: big-endian BOM    00 00 FE FF */
    (char)0xff, (char)0xfe, 0, 0    /* offset 4: little-endian BOM FF FE 00 00 */
};
1037 
1038 static void
_UTF32ToUnicodeWithOffsets(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)1039 _UTF32ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
1040                            UErrorCode *pErrorCode) {
1041     UConverter *cnv=pArgs->converter;
1042     const char *source=pArgs->source;
1043     const char *sourceLimit=pArgs->sourceLimit;
1044     int32_t *offsets=pArgs->offsets;
1045 
1046     int32_t state, offsetDelta;
1047     char b;
1048 
1049     state=cnv->mode;
1050 
1051     /*
1052      * If we detect a BOM in this buffer, then we must add the BOM size to the
1053      * offsets because the actual converter function will not see and count the BOM.
1054      * offsetDelta will have the number of the BOM bytes that are in the current buffer.
1055      */
1056     offsetDelta=0;
1057 
1058     while(source<sourceLimit && U_SUCCESS(*pErrorCode)) {
1059         switch(state) {
1060         case 0:
1061             b=*source;
1062             if(b==0) {
1063                 state=1; /* could be 00 00 FE FF */
1064             } else if(b==(char)0xff) {
1065                 state=5; /* could be FF FE 00 00 */
1066             } else {
1067                 state=8; /* default to UTF-32BE */
1068                 continue;
1069             }
1070             ++source;
1071             break;
1072         case 1:
1073         case 2:
1074         case 3:
1075         case 5:
1076         case 6:
1077         case 7:
1078             if(*source==utf32BOM[state]) {
1079                 ++state;
1080                 ++source;
1081                 if(state==4) {
1082                     state=8; /* detect UTF-32BE */
1083                     offsetDelta=(int32_t)(source-pArgs->source);
1084                 } else if(state==8) {
1085                     state=9; /* detect UTF-32LE */
1086                     offsetDelta=(int32_t)(source-pArgs->source);
1087                 }
1088             } else {
1089                 /* switch to UTF-32BE and pass the previous bytes */
1090                 int32_t count=(int32_t)(source-pArgs->source); /* number of bytes from this buffer */
1091 
1092                 /* reset the source */
1093                 source=pArgs->source;
1094 
1095                 if(count==(state&3)) {
1096                     /* simple: all in the same buffer, just reset source */
1097                 } else {
1098                     UBool oldFlush=pArgs->flush;
1099 
1100                     /* some of the bytes are from a previous buffer, replay those first */
1101                     pArgs->source=utf32BOM+(state&4); /* select the correct BOM */
1102                     pArgs->sourceLimit=pArgs->source+((state&3)-count); /* replay previous bytes */
1103                     pArgs->flush=FALSE; /* this sourceLimit is not the real source stream limit */
1104 
1105                     /* no offsets: bytes from previous buffer, and not enough for output */
1106                     T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1107 
1108                     /* restore real pointers; pArgs->source will be set in case 8/9 */
1109                     pArgs->sourceLimit=sourceLimit;
1110                     pArgs->flush=oldFlush;
1111                 }
1112                 state=8;
1113                 continue;
1114             }
1115             break;
1116         case 8:
1117             /* call UTF-32BE */
1118             pArgs->source=source;
1119             if(offsets==NULL) {
1120                 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1121             } else {
1122                 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(pArgs, pErrorCode);
1123             }
1124             source=pArgs->source;
1125             break;
1126         case 9:
1127             /* call UTF-32LE */
1128             pArgs->source=source;
1129             if(offsets==NULL) {
1130                 T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);
1131             } else {
1132                 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(pArgs, pErrorCode);
1133             }
1134             source=pArgs->source;
1135             break;
1136         default:
1137             break; /* does not occur */
1138         }
1139     }
1140 
1141     /* add BOM size to offsets - see comment at offsetDelta declaration */
1142     if(offsets!=NULL && offsetDelta!=0) {
1143         int32_t *offsetsLimit=pArgs->offsets;
1144         while(offsets<offsetsLimit) {
1145             *offsets++ += offsetDelta;
1146         }
1147     }
1148 
1149     pArgs->source=source;
1150 
1151     if(source==sourceLimit && pArgs->flush) {
1152         /* handle truncated input */
1153         switch(state) {
1154         case 0:
1155             break; /* no input at all, nothing to do */
1156         case 8:
1157             T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1158             break;
1159         case 9:
1160             T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);
1161             break;
1162         default:
1163             /* handle 0<state<8: call UTF-32BE with too-short input */
1164             pArgs->source=utf32BOM+(state&4); /* select the correct BOM */
1165             pArgs->sourceLimit=pArgs->source+(state&3); /* replay bytes */
1166 
1167             /* no offsets: not enough for output */
1168             T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1169             pArgs->source=source;
1170             pArgs->sourceLimit=sourceLimit;
1171             state=8;
1172             break;
1173         }
1174     }
1175 
1176     cnv->mode=state;
1177 }
1178 
1179 static UChar32
_UTF32GetNextUChar(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)1180 _UTF32GetNextUChar(UConverterToUnicodeArgs *pArgs,
1181                    UErrorCode *pErrorCode) {
1182     switch(pArgs->converter->mode) {
1183     case 8:
1184         return T_UConverter_getNextUChar_UTF32_BE(pArgs, pErrorCode);
1185     case 9:
1186         return T_UConverter_getNextUChar_UTF32_LE(pArgs, pErrorCode);
1187     default:
1188         return UCNV_GET_NEXT_UCHAR_USE_TO_U;
1189     }
1190 }
1191 
1192 static const UConverterImpl _UTF32Impl = {
1193     UCNV_UTF32,
1194 
1195     NULL,
1196     NULL,
1197 
1198     _UTF32Open,
1199     NULL,
1200     _UTF32Reset,
1201 
1202     _UTF32ToUnicodeWithOffsets,
1203     _UTF32ToUnicodeWithOffsets,
1204 #if U_IS_BIG_ENDIAN
1205     T_UConverter_fromUnicode_UTF32_BE,
1206     T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC,
1207 #else
1208     T_UConverter_fromUnicode_UTF32_LE,
1209     T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC,
1210 #endif
1211     _UTF32GetNextUChar,
1212 
1213     NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
1214     NULL,
1215     NULL,
1216     NULL,
1217     ucnv_getNonSurrogateUnicodeSet
1218 };
1219 
/* The 1236 CCSID refers to any version of Unicode with a BOM-sensitive endianness of UTF-32 */
1221 static const UConverterStaticData _UTF32StaticData = {
1222     sizeof(UConverterStaticData),
1223     "UTF-32",
1224     1236,
1225     UCNV_IBM, UCNV_UTF32, 4, 4,
1226 #if U_IS_BIG_ENDIAN
1227     { 0, 0, 0xff, 0xfd }, 4,
1228 #else
1229     { 0xfd, 0xff, 0, 0 }, 4,
1230 #endif
1231     FALSE, FALSE,
1232     0,
1233     0,
1234     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1235 };
1236 
1237 const UConverterSharedData _UTF32Data =
1238         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF32StaticData, &_UTF32Impl);
1239 
1240 #endif
1241