1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (C) 2002-2015, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 * file name: ucnv_u32.c
9 * encoding: UTF-8
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2002jul01
14 * created by: Markus W. Scherer
15 *
16 * UTF-32 converter implementation. Used to be in ucnv_utf.c.
17 */
18
19 #include "unicode/utypes.h"
20
21 #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
22
23 #include "unicode/ucnv.h"
24 #include "unicode/utf.h"
25 #include "ucnv_bld.h"
26 #include "ucnv_cnv.h"
27 #include "cmemory.h"
28
/* Largest value that fits into a single 16-bit code unit. */
#define MAXIMUM_UCS2            0xFFFF
/* Largest value accepted as a code point by the converters in this file. */
#define MAXIMUM_UTF             0x10FFFF

/* Building blocks for combining a lead/trail surrogate pair by hand. */
#define HALF_SHIFT              10
#define HALF_BASE               0x10000
#define HALF_MASK               0x3FF
#define SURROGATE_HIGH_START    0xD800
#define SURROGATE_LOW_START     0xDC00

/*
 * Constant added when combining a surrogate pair:
 * -SURROGATE_LOW_START + HALF_BASE, i.e. 9216.
 */
#define SURROGATE_LOW_BASE      (HALF_BASE - SURROGATE_LOW_START)
39
/*
 * Marker value kept in a converter's fromUnicodeStatus: the fromUnicode
 * functions in this file emit a byte order mark first when they find it.
 */
enum {
    UCNV_NEED_TO_WRITE_BOM = 1
};
43
44 /* UTF-32BE ----------------------------------------------------------------- */
45 U_CDECL_BEGIN
46 static void U_CALLCONV
T_UConverter_toUnicode_UTF32_BE(UConverterToUnicodeArgs * args,UErrorCode * err)47 T_UConverter_toUnicode_UTF32_BE(UConverterToUnicodeArgs * args,
48 UErrorCode * err)
49 {
50 const unsigned char *mySource = (unsigned char *) args->source;
51 UChar *myTarget = args->target;
52 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
53 const UChar *targetLimit = args->targetLimit;
54 unsigned char *toUBytes = args->converter->toUBytes;
55 uint32_t ch, i;
56
57 /* Restore state of current sequence */
58 if (args->converter->toULength > 0 && myTarget < targetLimit) {
59 i = args->converter->toULength; /* restore # of bytes consumed */
60 args->converter->toULength = 0;
61
62 ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
63 args->converter->toUnicodeStatus = 0;
64 goto morebytes;
65 }
66
67 while (mySource < sourceLimit && myTarget < targetLimit) {
68 i = 0;
69 ch = 0;
70 morebytes:
71 while (i < sizeof(uint32_t)) {
72 if (mySource < sourceLimit) {
73 ch = (ch << 8) | (uint8_t)(*mySource);
74 toUBytes[i++] = (char) *(mySource++);
75 }
76 else {
77 /* stores a partially calculated target*/
78 /* + 1 to make 0 a valid character */
79 args->converter->toUnicodeStatus = ch + 1;
80 args->converter->toULength = (int8_t) i;
81 goto donefornow;
82 }
83 }
84
85 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
86 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
87 if (ch <= MAXIMUM_UCS2)
88 {
89 /* fits in 16 bits */
90 *(myTarget++) = (UChar) ch;
91 }
92 else {
93 /* write out the surrogates */
94 *(myTarget++) = U16_LEAD(ch);
95 ch = U16_TRAIL(ch);
96 if (myTarget < targetLimit) {
97 *(myTarget++) = (UChar)ch;
98 }
99 else {
100 /* Put in overflow buffer (not handled here) */
101 args->converter->UCharErrorBuffer[0] = (UChar) ch;
102 args->converter->UCharErrorBufferLength = 1;
103 *err = U_BUFFER_OVERFLOW_ERROR;
104 break;
105 }
106 }
107 }
108 else {
109 args->converter->toULength = (int8_t)i;
110 *err = U_ILLEGAL_CHAR_FOUND;
111 break;
112 }
113 }
114
115 donefornow:
116 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
117 /* End of target buffer */
118 *err = U_BUFFER_OVERFLOW_ERROR;
119 }
120
121 args->target = myTarget;
122 args->source = (const char *) mySource;
123 }
124
125 static void U_CALLCONV
T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,UErrorCode * err)126 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
127 UErrorCode * err)
128 {
129 const unsigned char *mySource = (unsigned char *) args->source;
130 UChar *myTarget = args->target;
131 int32_t *myOffsets = args->offsets;
132 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
133 const UChar *targetLimit = args->targetLimit;
134 unsigned char *toUBytes = args->converter->toUBytes;
135 uint32_t ch, i;
136 int32_t offsetNum = 0;
137
138 /* Restore state of current sequence */
139 if (args->converter->toULength > 0 && myTarget < targetLimit) {
140 i = args->converter->toULength; /* restore # of bytes consumed */
141 args->converter->toULength = 0;
142
143 ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
144 args->converter->toUnicodeStatus = 0;
145 goto morebytes;
146 }
147
148 while (mySource < sourceLimit && myTarget < targetLimit) {
149 i = 0;
150 ch = 0;
151 morebytes:
152 while (i < sizeof(uint32_t)) {
153 if (mySource < sourceLimit) {
154 ch = (ch << 8) | (uint8_t)(*mySource);
155 toUBytes[i++] = (char) *(mySource++);
156 }
157 else {
158 /* stores a partially calculated target*/
159 /* + 1 to make 0 a valid character */
160 args->converter->toUnicodeStatus = ch + 1;
161 args->converter->toULength = (int8_t) i;
162 goto donefornow;
163 }
164 }
165
166 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
167 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
168 if (ch <= MAXIMUM_UCS2) {
169 /* fits in 16 bits */
170 *(myTarget++) = (UChar) ch;
171 *(myOffsets++) = offsetNum;
172 }
173 else {
174 /* write out the surrogates */
175 *(myTarget++) = U16_LEAD(ch);
176 *myOffsets++ = offsetNum;
177 ch = U16_TRAIL(ch);
178 if (myTarget < targetLimit)
179 {
180 *(myTarget++) = (UChar)ch;
181 *(myOffsets++) = offsetNum;
182 }
183 else {
184 /* Put in overflow buffer (not handled here) */
185 args->converter->UCharErrorBuffer[0] = (UChar) ch;
186 args->converter->UCharErrorBufferLength = 1;
187 *err = U_BUFFER_OVERFLOW_ERROR;
188 break;
189 }
190 }
191 }
192 else {
193 args->converter->toULength = (int8_t)i;
194 *err = U_ILLEGAL_CHAR_FOUND;
195 break;
196 }
197 offsetNum += i;
198 }
199
200 donefornow:
201 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
202 {
203 /* End of target buffer */
204 *err = U_BUFFER_OVERFLOW_ERROR;
205 }
206
207 args->target = myTarget;
208 args->source = (const char *) mySource;
209 args->offsets = myOffsets;
210 }
211
212 static void U_CALLCONV
T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs * args,UErrorCode * err)213 T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs * args,
214 UErrorCode * err)
215 {
216 const UChar *mySource = args->source;
217 unsigned char *myTarget;
218 const UChar *sourceLimit = args->sourceLimit;
219 const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
220 UChar32 ch, ch2;
221 unsigned int indexToWrite;
222 unsigned char temp[sizeof(uint32_t)];
223
224 if(mySource >= sourceLimit) {
225 /* no input, nothing to do */
226 return;
227 }
228
229 /* write the BOM if necessary */
230 if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
231 static const char bom[]={ 0, 0, (char)0xfeu, (char)0xffu };
232 ucnv_fromUWriteBytes(args->converter,
233 bom, 4,
234 &args->target, args->targetLimit,
235 &args->offsets, -1,
236 err);
237 args->converter->fromUnicodeStatus=0;
238 }
239
240 myTarget = (unsigned char *) args->target;
241 temp[0] = 0;
242
243 if (args->converter->fromUChar32) {
244 ch = args->converter->fromUChar32;
245 args->converter->fromUChar32 = 0;
246 goto lowsurogate;
247 }
248
249 while (mySource < sourceLimit && myTarget < targetLimit) {
250 ch = *(mySource++);
251
252 if (U_IS_SURROGATE(ch)) {
253 if (U_IS_LEAD(ch)) {
254 lowsurogate:
255 if (mySource < sourceLimit) {
256 ch2 = *mySource;
257 if (U_IS_TRAIL(ch2)) {
258 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
259 mySource++;
260 }
261 else {
262 /* this is an unmatched trail code unit (2nd surrogate) */
263 /* callback(illegal) */
264 args->converter->fromUChar32 = ch;
265 *err = U_ILLEGAL_CHAR_FOUND;
266 break;
267 }
268 }
269 else {
270 /* ran out of source */
271 args->converter->fromUChar32 = ch;
272 if (args->flush) {
273 /* this is an unmatched trail code unit (2nd surrogate) */
274 /* callback(illegal) */
275 *err = U_ILLEGAL_CHAR_FOUND;
276 }
277 break;
278 }
279 }
280 else {
281 /* this is an unmatched trail code unit (2nd surrogate) */
282 /* callback(illegal) */
283 args->converter->fromUChar32 = ch;
284 *err = U_ILLEGAL_CHAR_FOUND;
285 break;
286 }
287 }
288
289 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
290 temp[1] = (uint8_t) (ch >> 16 & 0x1F);
291 temp[2] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */
292 temp[3] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */
293
294 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {
295 if (myTarget < targetLimit) {
296 *(myTarget++) = temp[indexToWrite];
297 }
298 else {
299 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
300 *err = U_BUFFER_OVERFLOW_ERROR;
301 }
302 }
303 }
304
305 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
306 *err = U_BUFFER_OVERFLOW_ERROR;
307 }
308
309 args->target = (char *) myTarget;
310 args->source = mySource;
311 }
312
313 static void U_CALLCONV
T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,UErrorCode * err)314 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
315 UErrorCode * err)
316 {
317 const UChar *mySource = args->source;
318 unsigned char *myTarget;
319 int32_t *myOffsets;
320 const UChar *sourceLimit = args->sourceLimit;
321 const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
322 UChar32 ch, ch2;
323 int32_t offsetNum = 0;
324 unsigned int indexToWrite;
325 unsigned char temp[sizeof(uint32_t)];
326
327 if(mySource >= sourceLimit) {
328 /* no input, nothing to do */
329 return;
330 }
331
332 /* write the BOM if necessary */
333 if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
334 static const char bom[]={ 0, 0, (char)0xfeu, (char)0xffu };
335 ucnv_fromUWriteBytes(args->converter,
336 bom, 4,
337 &args->target, args->targetLimit,
338 &args->offsets, -1,
339 err);
340 args->converter->fromUnicodeStatus=0;
341 }
342
343 myTarget = (unsigned char *) args->target;
344 myOffsets = args->offsets;
345 temp[0] = 0;
346
347 if (args->converter->fromUChar32) {
348 ch = args->converter->fromUChar32;
349 args->converter->fromUChar32 = 0;
350 goto lowsurogate;
351 }
352
353 while (mySource < sourceLimit && myTarget < targetLimit) {
354 ch = *(mySource++);
355
356 if (U_IS_SURROGATE(ch)) {
357 if (U_IS_LEAD(ch)) {
358 lowsurogate:
359 if (mySource < sourceLimit) {
360 ch2 = *mySource;
361 if (U_IS_TRAIL(ch2)) {
362 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
363 mySource++;
364 }
365 else {
366 /* this is an unmatched trail code unit (2nd surrogate) */
367 /* callback(illegal) */
368 args->converter->fromUChar32 = ch;
369 *err = U_ILLEGAL_CHAR_FOUND;
370 break;
371 }
372 }
373 else {
374 /* ran out of source */
375 args->converter->fromUChar32 = ch;
376 if (args->flush) {
377 /* this is an unmatched trail code unit (2nd surrogate) */
378 /* callback(illegal) */
379 *err = U_ILLEGAL_CHAR_FOUND;
380 }
381 break;
382 }
383 }
384 else {
385 /* this is an unmatched trail code unit (2nd surrogate) */
386 /* callback(illegal) */
387 args->converter->fromUChar32 = ch;
388 *err = U_ILLEGAL_CHAR_FOUND;
389 break;
390 }
391 }
392
393 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
394 temp[1] = (uint8_t) (ch >> 16 & 0x1F);
395 temp[2] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */
396 temp[3] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */
397
398 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {
399 if (myTarget < targetLimit) {
400 *(myTarget++) = temp[indexToWrite];
401 *(myOffsets++) = offsetNum;
402 }
403 else {
404 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
405 *err = U_BUFFER_OVERFLOW_ERROR;
406 }
407 }
408 offsetNum = offsetNum + 1 + (temp[1] != 0);
409 }
410
411 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
412 *err = U_BUFFER_OVERFLOW_ERROR;
413 }
414
415 args->target = (char *) myTarget;
416 args->source = mySource;
417 args->offsets = myOffsets;
418 }
419
420 static UChar32 U_CALLCONV
T_UConverter_getNextUChar_UTF32_BE(UConverterToUnicodeArgs * args,UErrorCode * err)421 T_UConverter_getNextUChar_UTF32_BE(UConverterToUnicodeArgs* args,
422 UErrorCode* err)
423 {
424 const uint8_t *mySource;
425 UChar32 myUChar;
426 int32_t length;
427
428 mySource = (const uint8_t *)args->source;
429 if (mySource >= (const uint8_t *)args->sourceLimit)
430 {
431 /* no input */
432 *err = U_INDEX_OUTOFBOUNDS_ERROR;
433 return 0xffff;
434 }
435
436 length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
437 if (length < 4)
438 {
439 /* got a partial character */
440 uprv_memcpy(args->converter->toUBytes, mySource, length);
441 args->converter->toULength = (int8_t)length;
442 args->source = (const char *)(mySource + length);
443 *err = U_TRUNCATED_CHAR_FOUND;
444 return 0xffff;
445 }
446
447 /* Don't even try to do a direct cast because the value may be on an odd address. */
448 myUChar = ((UChar32)mySource[0] << 24)
449 | ((UChar32)mySource[1] << 16)
450 | ((UChar32)mySource[2] << 8)
451 | ((UChar32)mySource[3]);
452
453 args->source = (const char *)(mySource + 4);
454 if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
455 return myUChar;
456 }
457
458 uprv_memcpy(args->converter->toUBytes, mySource, 4);
459 args->converter->toULength = 4;
460
461 *err = U_ILLEGAL_CHAR_FOUND;
462 return 0xffff;
463 }
464 U_CDECL_END
465 static const UConverterImpl _UTF32BEImpl = {
466 UCNV_UTF32_BigEndian,
467
468 NULL,
469 NULL,
470
471 NULL,
472 NULL,
473 NULL,
474
475 T_UConverter_toUnicode_UTF32_BE,
476 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC,
477 T_UConverter_fromUnicode_UTF32_BE,
478 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC,
479 T_UConverter_getNextUChar_UTF32_BE,
480
481 NULL,
482 NULL,
483 NULL,
484 NULL,
485 ucnv_getNonSurrogateUnicodeSet,
486
487 NULL,
488 NULL
489 };
490
/* The 1232 CCSID refers to any version of Unicode with any endianness of UTF-32 */
492 static const UConverterStaticData _UTF32BEStaticData = {
493 sizeof(UConverterStaticData),
494 "UTF-32BE",
495 1232,
496 UCNV_IBM, UCNV_UTF32_BigEndian, 4, 4,
497 { 0, 0, 0xff, 0xfd }, 4, FALSE, FALSE,
498 0,
499 0,
500 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
501 };
502
503 const UConverterSharedData _UTF32BEData =
504 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF32BEStaticData, &_UTF32BEImpl);
505
506 /* UTF-32LE ---------------------------------------------------------- */
507 U_CDECL_BEGIN
508 static void U_CALLCONV
T_UConverter_toUnicode_UTF32_LE(UConverterToUnicodeArgs * args,UErrorCode * err)509 T_UConverter_toUnicode_UTF32_LE(UConverterToUnicodeArgs * args,
510 UErrorCode * err)
511 {
512 const unsigned char *mySource = (unsigned char *) args->source;
513 UChar *myTarget = args->target;
514 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
515 const UChar *targetLimit = args->targetLimit;
516 unsigned char *toUBytes = args->converter->toUBytes;
517 uint32_t ch, i;
518
519 /* Restore state of current sequence */
520 if (args->converter->toULength > 0 && myTarget < targetLimit)
521 {
522 i = args->converter->toULength; /* restore # of bytes consumed */
523 args->converter->toULength = 0;
524
525 /* Stores the previously calculated ch from a previous call*/
526 ch = args->converter->toUnicodeStatus - 1;
527 args->converter->toUnicodeStatus = 0;
528 goto morebytes;
529 }
530
531 while (mySource < sourceLimit && myTarget < targetLimit)
532 {
533 i = 0;
534 ch = 0;
535 morebytes:
536 while (i < sizeof(uint32_t))
537 {
538 if (mySource < sourceLimit)
539 {
540 ch |= ((uint8_t)(*mySource)) << (i * 8);
541 toUBytes[i++] = (char) *(mySource++);
542 }
543 else
544 {
545 /* stores a partially calculated target*/
546 /* + 1 to make 0 a valid character */
547 args->converter->toUnicodeStatus = ch + 1;
548 args->converter->toULength = (int8_t) i;
549 goto donefornow;
550 }
551 }
552
553 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
554 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
555 if (ch <= MAXIMUM_UCS2) {
556 /* fits in 16 bits */
557 *(myTarget++) = (UChar) ch;
558 }
559 else {
560 /* write out the surrogates */
561 *(myTarget++) = U16_LEAD(ch);
562 ch = U16_TRAIL(ch);
563 if (myTarget < targetLimit) {
564 *(myTarget++) = (UChar)ch;
565 }
566 else {
567 /* Put in overflow buffer (not handled here) */
568 args->converter->UCharErrorBuffer[0] = (UChar) ch;
569 args->converter->UCharErrorBufferLength = 1;
570 *err = U_BUFFER_OVERFLOW_ERROR;
571 break;
572 }
573 }
574 }
575 else {
576 args->converter->toULength = (int8_t)i;
577 *err = U_ILLEGAL_CHAR_FOUND;
578 break;
579 }
580 }
581
582 donefornow:
583 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
584 {
585 /* End of target buffer */
586 *err = U_BUFFER_OVERFLOW_ERROR;
587 }
588
589 args->target = myTarget;
590 args->source = (const char *) mySource;
591 }
592
593 static void U_CALLCONV
T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,UErrorCode * err)594 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
595 UErrorCode * err)
596 {
597 const unsigned char *mySource = (unsigned char *) args->source;
598 UChar *myTarget = args->target;
599 int32_t *myOffsets = args->offsets;
600 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
601 const UChar *targetLimit = args->targetLimit;
602 unsigned char *toUBytes = args->converter->toUBytes;
603 uint32_t ch, i;
604 int32_t offsetNum = 0;
605
606 /* Restore state of current sequence */
607 if (args->converter->toULength > 0 && myTarget < targetLimit)
608 {
609 i = args->converter->toULength; /* restore # of bytes consumed */
610 args->converter->toULength = 0;
611
612 /* Stores the previously calculated ch from a previous call*/
613 ch = args->converter->toUnicodeStatus - 1;
614 args->converter->toUnicodeStatus = 0;
615 goto morebytes;
616 }
617
618 while (mySource < sourceLimit && myTarget < targetLimit)
619 {
620 i = 0;
621 ch = 0;
622 morebytes:
623 while (i < sizeof(uint32_t))
624 {
625 if (mySource < sourceLimit)
626 {
627 ch |= ((uint8_t)(*mySource)) << (i * 8);
628 toUBytes[i++] = (char) *(mySource++);
629 }
630 else
631 {
632 /* stores a partially calculated target*/
633 /* + 1 to make 0 a valid character */
634 args->converter->toUnicodeStatus = ch + 1;
635 args->converter->toULength = (int8_t) i;
636 goto donefornow;
637 }
638 }
639
640 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch))
641 {
642 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
643 if (ch <= MAXIMUM_UCS2)
644 {
645 /* fits in 16 bits */
646 *(myTarget++) = (UChar) ch;
647 *(myOffsets++) = offsetNum;
648 }
649 else {
650 /* write out the surrogates */
651 *(myTarget++) = U16_LEAD(ch);
652 *(myOffsets++) = offsetNum;
653 ch = U16_TRAIL(ch);
654 if (myTarget < targetLimit)
655 {
656 *(myTarget++) = (UChar)ch;
657 *(myOffsets++) = offsetNum;
658 }
659 else
660 {
661 /* Put in overflow buffer (not handled here) */
662 args->converter->UCharErrorBuffer[0] = (UChar) ch;
663 args->converter->UCharErrorBufferLength = 1;
664 *err = U_BUFFER_OVERFLOW_ERROR;
665 break;
666 }
667 }
668 }
669 else
670 {
671 args->converter->toULength = (int8_t)i;
672 *err = U_ILLEGAL_CHAR_FOUND;
673 break;
674 }
675 offsetNum += i;
676 }
677
678 donefornow:
679 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
680 {
681 /* End of target buffer */
682 *err = U_BUFFER_OVERFLOW_ERROR;
683 }
684
685 args->target = myTarget;
686 args->source = (const char *) mySource;
687 args->offsets = myOffsets;
688 }
689
690 static void U_CALLCONV
T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs * args,UErrorCode * err)691 T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs * args,
692 UErrorCode * err)
693 {
694 const UChar *mySource = args->source;
695 unsigned char *myTarget;
696 const UChar *sourceLimit = args->sourceLimit;
697 const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
698 UChar32 ch, ch2;
699 unsigned int indexToWrite;
700 unsigned char temp[sizeof(uint32_t)];
701
702 if(mySource >= sourceLimit) {
703 /* no input, nothing to do */
704 return;
705 }
706
707 /* write the BOM if necessary */
708 if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
709 static const char bom[]={ (char)0xffu, (char)0xfeu, 0, 0 };
710 ucnv_fromUWriteBytes(args->converter,
711 bom, 4,
712 &args->target, args->targetLimit,
713 &args->offsets, -1,
714 err);
715 args->converter->fromUnicodeStatus=0;
716 }
717
718 myTarget = (unsigned char *) args->target;
719 temp[3] = 0;
720
721 if (args->converter->fromUChar32)
722 {
723 ch = args->converter->fromUChar32;
724 args->converter->fromUChar32 = 0;
725 goto lowsurogate;
726 }
727
728 while (mySource < sourceLimit && myTarget < targetLimit)
729 {
730 ch = *(mySource++);
731
732 if (U16_IS_SURROGATE(ch)) {
733 if (U16_IS_LEAD(ch))
734 {
735 lowsurogate:
736 if (mySource < sourceLimit)
737 {
738 ch2 = *mySource;
739 if (U16_IS_TRAIL(ch2)) {
740 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
741 mySource++;
742 }
743 else {
744 /* this is an unmatched trail code unit (2nd surrogate) */
745 /* callback(illegal) */
746 args->converter->fromUChar32 = ch;
747 *err = U_ILLEGAL_CHAR_FOUND;
748 break;
749 }
750 }
751 else {
752 /* ran out of source */
753 args->converter->fromUChar32 = ch;
754 if (args->flush) {
755 /* this is an unmatched trail code unit (2nd surrogate) */
756 /* callback(illegal) */
757 *err = U_ILLEGAL_CHAR_FOUND;
758 }
759 break;
760 }
761 }
762 else {
763 /* this is an unmatched trail code unit (2nd surrogate) */
764 /* callback(illegal) */
765 args->converter->fromUChar32 = ch;
766 *err = U_ILLEGAL_CHAR_FOUND;
767 break;
768 }
769 }
770
771 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
772 temp[2] = (uint8_t) (ch >> 16 & 0x1F);
773 temp[1] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */
774 temp[0] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */
775
776 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
777 {
778 if (myTarget < targetLimit)
779 {
780 *(myTarget++) = temp[indexToWrite];
781 }
782 else
783 {
784 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
785 *err = U_BUFFER_OVERFLOW_ERROR;
786 }
787 }
788 }
789
790 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
791 {
792 *err = U_BUFFER_OVERFLOW_ERROR;
793 }
794
795 args->target = (char *) myTarget;
796 args->source = mySource;
797 }
798
799 static void U_CALLCONV
T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,UErrorCode * err)800 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
801 UErrorCode * err)
802 {
803 const UChar *mySource = args->source;
804 unsigned char *myTarget;
805 int32_t *myOffsets;
806 const UChar *sourceLimit = args->sourceLimit;
807 const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
808 UChar32 ch, ch2;
809 unsigned int indexToWrite;
810 unsigned char temp[sizeof(uint32_t)];
811 int32_t offsetNum = 0;
812
813 if(mySource >= sourceLimit) {
814 /* no input, nothing to do */
815 return;
816 }
817
818 /* write the BOM if necessary */
819 if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
820 static const char bom[]={ (char)0xffu, (char)0xfeu, 0, 0 };
821 ucnv_fromUWriteBytes(args->converter,
822 bom, 4,
823 &args->target, args->targetLimit,
824 &args->offsets, -1,
825 err);
826 args->converter->fromUnicodeStatus=0;
827 }
828
829 myTarget = (unsigned char *) args->target;
830 myOffsets = args->offsets;
831 temp[3] = 0;
832
833 if (args->converter->fromUChar32)
834 {
835 ch = args->converter->fromUChar32;
836 args->converter->fromUChar32 = 0;
837 goto lowsurogate;
838 }
839
840 while (mySource < sourceLimit && myTarget < targetLimit)
841 {
842 ch = *(mySource++);
843
844 if (U16_IS_SURROGATE(ch)) {
845 if (U16_IS_LEAD(ch))
846 {
847 lowsurogate:
848 if (mySource < sourceLimit)
849 {
850 ch2 = *mySource;
851 if (U16_IS_TRAIL(ch2))
852 {
853 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
854 mySource++;
855 }
856 else {
857 /* this is an unmatched trail code unit (2nd surrogate) */
858 /* callback(illegal) */
859 args->converter->fromUChar32 = ch;
860 *err = U_ILLEGAL_CHAR_FOUND;
861 break;
862 }
863 }
864 else {
865 /* ran out of source */
866 args->converter->fromUChar32 = ch;
867 if (args->flush) {
868 /* this is an unmatched trail code unit (2nd surrogate) */
869 /* callback(illegal) */
870 *err = U_ILLEGAL_CHAR_FOUND;
871 }
872 break;
873 }
874 }
875 else {
876 /* this is an unmatched trail code unit (2nd surrogate) */
877 /* callback(illegal) */
878 args->converter->fromUChar32 = ch;
879 *err = U_ILLEGAL_CHAR_FOUND;
880 break;
881 }
882 }
883
884 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
885 temp[2] = (uint8_t) (ch >> 16 & 0x1F);
886 temp[1] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */
887 temp[0] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */
888
889 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
890 {
891 if (myTarget < targetLimit)
892 {
893 *(myTarget++) = temp[indexToWrite];
894 *(myOffsets++) = offsetNum;
895 }
896 else
897 {
898 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
899 *err = U_BUFFER_OVERFLOW_ERROR;
900 }
901 }
902 offsetNum = offsetNum + 1 + (temp[2] != 0);
903 }
904
905 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
906 {
907 *err = U_BUFFER_OVERFLOW_ERROR;
908 }
909
910 args->target = (char *) myTarget;
911 args->source = mySource;
912 args->offsets = myOffsets;
913 }
914
915 static UChar32 U_CALLCONV
T_UConverter_getNextUChar_UTF32_LE(UConverterToUnicodeArgs * args,UErrorCode * err)916 T_UConverter_getNextUChar_UTF32_LE(UConverterToUnicodeArgs* args,
917 UErrorCode* err)
918 {
919 const uint8_t *mySource;
920 UChar32 myUChar;
921 int32_t length;
922
923 mySource = (const uint8_t *)args->source;
924 if (mySource >= (const uint8_t *)args->sourceLimit)
925 {
926 /* no input */
927 *err = U_INDEX_OUTOFBOUNDS_ERROR;
928 return 0xffff;
929 }
930
931 length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
932 if (length < 4)
933 {
934 /* got a partial character */
935 uprv_memcpy(args->converter->toUBytes, mySource, length);
936 args->converter->toULength = (int8_t)length;
937 args->source = (const char *)(mySource + length);
938 *err = U_TRUNCATED_CHAR_FOUND;
939 return 0xffff;
940 }
941
942 /* Don't even try to do a direct cast because the value may be on an odd address. */
943 myUChar = ((UChar32)mySource[3] << 24)
944 | ((UChar32)mySource[2] << 16)
945 | ((UChar32)mySource[1] << 8)
946 | ((UChar32)mySource[0]);
947
948 args->source = (const char *)(mySource + 4);
949 if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
950 return myUChar;
951 }
952
953 uprv_memcpy(args->converter->toUBytes, mySource, 4);
954 args->converter->toULength = 4;
955
956 *err = U_ILLEGAL_CHAR_FOUND;
957 return 0xffff;
958 }
959 U_CDECL_END
960 static const UConverterImpl _UTF32LEImpl = {
961 UCNV_UTF32_LittleEndian,
962
963 NULL,
964 NULL,
965
966 NULL,
967 NULL,
968 NULL,
969
970 T_UConverter_toUnicode_UTF32_LE,
971 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC,
972 T_UConverter_fromUnicode_UTF32_LE,
973 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC,
974 T_UConverter_getNextUChar_UTF32_LE,
975
976 NULL,
977 NULL,
978 NULL,
979 NULL,
980 ucnv_getNonSurrogateUnicodeSet,
981
982 NULL,
983 NULL
984 };
985
/* The 1232 CCSID refers to any version of Unicode with any endianness of UTF-32 */
987 static const UConverterStaticData _UTF32LEStaticData = {
988 sizeof(UConverterStaticData),
989 "UTF-32LE",
990 1234,
991 UCNV_IBM, UCNV_UTF32_LittleEndian, 4, 4,
992 { 0xfd, 0xff, 0, 0 }, 4, FALSE, FALSE,
993 0,
994 0,
995 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
996 };
997
998
999 const UConverterSharedData _UTF32LEData =
1000 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF32LEStaticData, &_UTF32LEImpl);
1001
1002 /* UTF-32 (Detect BOM) ------------------------------------------------------ */
1003
1004 /*
1005 * Detect a BOM at the beginning of the stream and select UTF-32BE or UTF-32LE
1006 * accordingly.
1007 *
1008 * State values:
1009 * 0 initial state
1010 * 1 saw 00
1011 * 2 saw 00 00
1012 * 3 saw 00 00 FE
1013 * 4 -
1014 * 5 saw FF
1015 * 6 saw FF FE
1016 * 7 saw FF FE 00
1017 * 8 UTF-32BE mode
1018 * 9 UTF-32LE mode
1019 *
1020 * During detection: state&3==number of matching bytes so far.
1021 *
1022 * On output, emit U+FEFF as the first code point.
1023 */
1024 U_CDECL_BEGIN
1025 static void U_CALLCONV
_UTF32Reset(UConverter * cnv,UConverterResetChoice choice)1026 _UTF32Reset(UConverter *cnv, UConverterResetChoice choice) {
1027 if(choice<=UCNV_RESET_TO_UNICODE) {
1028 /* reset toUnicode: state=0 */
1029 cnv->mode=0;
1030 }
1031 if(choice!=UCNV_RESET_TO_UNICODE) {
1032 /* reset fromUnicode: prepare to output the UTF-32PE BOM */
1033 cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
1034 }
1035 }
1036
1037 static void U_CALLCONV
_UTF32Open(UConverter * cnv,UConverterLoadArgs * pArgs,UErrorCode * pErrorCode)1038 _UTF32Open(UConverter *cnv,
1039 UConverterLoadArgs *pArgs,
1040 UErrorCode *pErrorCode) {
1041 (void)pArgs;
1042 (void)pErrorCode;
1043 _UTF32Reset(cnv, UCNV_RESET_BOTH);
1044 }
1045
/*
 * Both UTF-32 byte order marks back to back:
 * indexes 0..3 hold the big-endian form, indexes 4..7 the little-endian form.
 */
static const char utf32BOM[8]={
    0,           0,           (char)0xfeu, (char)0xffu,     /* 00 00 FE FF */
    (char)0xffu, (char)0xfeu, 0,           0                /* FF FE 00 00 */
};
1047
1048 static void U_CALLCONV
_UTF32ToUnicodeWithOffsets(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)1049 _UTF32ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
1050 UErrorCode *pErrorCode) {
1051 UConverter *cnv=pArgs->converter;
1052 const char *source=pArgs->source;
1053 const char *sourceLimit=pArgs->sourceLimit;
1054 int32_t *offsets=pArgs->offsets;
1055
1056 int32_t state, offsetDelta;
1057 char b;
1058
1059 state=cnv->mode;
1060
1061 /*
1062 * If we detect a BOM in this buffer, then we must add the BOM size to the
1063 * offsets because the actual converter function will not see and count the BOM.
1064 * offsetDelta will have the number of the BOM bytes that are in the current buffer.
1065 */
1066 offsetDelta=0;
1067
1068 while(source<sourceLimit && U_SUCCESS(*pErrorCode)) {
1069 switch(state) {
1070 case 0:
1071 b=*source;
1072 if(b==0) {
1073 state=1; /* could be 00 00 FE FF */
1074 } else if(b==(char)0xffu) {
1075 state=5; /* could be FF FE 00 00 */
1076 } else {
1077 state=8; /* default to UTF-32BE */
1078 continue;
1079 }
1080 ++source;
1081 break;
1082 case 1:
1083 case 2:
1084 case 3:
1085 case 5:
1086 case 6:
1087 case 7:
1088 if(*source==utf32BOM[state]) {
1089 ++state;
1090 ++source;
1091 if(state==4) {
1092 state=8; /* detect UTF-32BE */
1093 offsetDelta=(int32_t)(source-pArgs->source);
1094 } else if(state==8) {
1095 state=9; /* detect UTF-32LE */
1096 offsetDelta=(int32_t)(source-pArgs->source);
1097 }
1098 } else {
1099 /* switch to UTF-32BE and pass the previous bytes */
1100 int32_t count=(int32_t)(source-pArgs->source); /* number of bytes from this buffer */
1101
1102 /* reset the source */
1103 source=pArgs->source;
1104
1105 if(count==(state&3)) {
1106 /* simple: all in the same buffer, just reset source */
1107 } else {
1108 UBool oldFlush=pArgs->flush;
1109
1110 /* some of the bytes are from a previous buffer, replay those first */
1111 pArgs->source=utf32BOM+(state&4); /* select the correct BOM */
1112 pArgs->sourceLimit=pArgs->source+((state&3)-count); /* replay previous bytes */
1113 pArgs->flush=FALSE; /* this sourceLimit is not the real source stream limit */
1114
1115 /* no offsets: bytes from previous buffer, and not enough for output */
1116 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1117
1118 /* restore real pointers; pArgs->source will be set in case 8/9 */
1119 pArgs->sourceLimit=sourceLimit;
1120 pArgs->flush=oldFlush;
1121 }
1122 state=8;
1123 continue;
1124 }
1125 break;
1126 case 8:
1127 /* call UTF-32BE */
1128 pArgs->source=source;
1129 if(offsets==NULL) {
1130 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1131 } else {
1132 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(pArgs, pErrorCode);
1133 }
1134 source=pArgs->source;
1135 break;
1136 case 9:
1137 /* call UTF-32LE */
1138 pArgs->source=source;
1139 if(offsets==NULL) {
1140 T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);
1141 } else {
1142 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(pArgs, pErrorCode);
1143 }
1144 source=pArgs->source;
1145 break;
1146 default:
1147 break; /* does not occur */
1148 }
1149 }
1150
1151 /* add BOM size to offsets - see comment at offsetDelta declaration */
1152 if(offsets!=NULL && offsetDelta!=0) {
1153 int32_t *offsetsLimit=pArgs->offsets;
1154 while(offsets<offsetsLimit) {
1155 *offsets++ += offsetDelta;
1156 }
1157 }
1158
1159 pArgs->source=source;
1160
1161 if(source==sourceLimit && pArgs->flush) {
1162 /* handle truncated input */
1163 switch(state) {
1164 case 0:
1165 break; /* no input at all, nothing to do */
1166 case 8:
1167 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1168 break;
1169 case 9:
1170 T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);
1171 break;
1172 default:
1173 /* handle 0<state<8: call UTF-32BE with too-short input */
1174 pArgs->source=utf32BOM+(state&4); /* select the correct BOM */
1175 pArgs->sourceLimit=pArgs->source+(state&3); /* replay bytes */
1176
1177 /* no offsets: not enough for output */
1178 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1179 pArgs->source=source;
1180 pArgs->sourceLimit=sourceLimit;
1181 state=8;
1182 break;
1183 }
1184 }
1185
1186 cnv->mode=state;
1187 }
1188
1189 static UChar32 U_CALLCONV
_UTF32GetNextUChar(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)1190 _UTF32GetNextUChar(UConverterToUnicodeArgs *pArgs,
1191 UErrorCode *pErrorCode) {
1192 switch(pArgs->converter->mode) {
1193 case 8:
1194 return T_UConverter_getNextUChar_UTF32_BE(pArgs, pErrorCode);
1195 case 9:
1196 return T_UConverter_getNextUChar_UTF32_LE(pArgs, pErrorCode);
1197 default:
1198 return UCNV_GET_NEXT_UCHAR_USE_TO_U;
1199 }
1200 }
1201 U_CDECL_END
/*
 * Function table for the BOM-sensitive "UTF-32" converter.
 *
 * The initializer is positional: the meaning of each slot is fixed by the
 * declaration of UConverterImpl, which is not part of this file section -
 * check it before adding, removing or reordering entries.
 * NULL entries leave the corresponding slot unset.
 */
static const UConverterImpl _UTF32Impl = {
    UCNV_UTF32,

    NULL,
    NULL,

    /* open / reset hooks of this converter; the slot between them is left unset */
    _UTF32Open,
    NULL,
    _UTF32Reset,

    /* to Unicode: the same signature-detecting function fills both slots */
    _UTF32ToUnicodeWithOffsets,
    _UTF32ToUnicodeWithOffsets,
    /* from Unicode: output uses the byte order of the platform ICU was built for */
#if U_IS_BIG_ENDIAN
    T_UConverter_fromUnicode_UTF32_BE,
    T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC,
#else
    T_UConverter_fromUnicode_UTF32_LE,
    T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC,
#endif
    _UTF32GetNextUChar,

    NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
    NULL,
    NULL,
    NULL,
    ucnv_getNonSurrogateUnicodeSet,

    NULL,
    NULL
};
1232
/*
 * Static description of the BOM-sensitive "UTF-32" converter.
 * The 1236 CCSID refers to any version of Unicode with a BOM-sensitive endianness of UTF-32.
 *
 * The initializer is positional; field meanings come from the declaration of
 * UConverterStaticData, which is not part of this file section.
 */
static const UConverterStaticData _UTF32StaticData = {
    sizeof(UConverterStaticData),
    "UTF-32",
    1236,
    UCNV_IBM, UCNV_UTF32, 4, 4,
    /*
     * Four bytes spelling 0x0000FFFD in the platform byte order, followed by
     * their count (presumably the substitution sequence and its length -
     * confirm against UConverterStaticData).
     */
#if U_IS_BIG_ENDIAN
    { 0, 0, 0xff, 0xfd }, 4,
#else
    { 0xfd, 0xff, 0, 0 }, 4,
#endif
    FALSE, FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
1249
/*
 * Shared-data object for the "UTF-32" converter. It has external linkage and
 * ties together the static description (_UTF32StaticData) and the function
 * table (_UTF32Impl) defined above.
 */
const UConverterSharedData _UTF32Data =
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF32StaticData, &_UTF32Impl);
1252
1253 #endif
1254