1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (C) 2002-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 * file name: ucnv_u8.c
9 * encoding: UTF-8
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2002jul01
14 * created by: Markus W. Scherer
15 *
16 * UTF-8 converter implementation. Used to be in ucnv_utf.c.
17 *
18 * Also, CESU-8 implementation, see UTR 26.
19 * The CESU-8 converter uses all the same functions as the
20 * UTF-8 converter, with a branch for converting supplementary code points.
21 */
22
23 #include "unicode/utypes.h"
24
25 #if !UCONFIG_NO_CONVERSION
26
27 #include "unicode/ucnv.h"
28 #include "unicode/utf.h"
29 #include "unicode/utf8.h"
30 #include "unicode/utf16.h"
31 #include "uassert.h"
32 #include "ucnv_bld.h"
33 #include "ucnv_cnv.h"
34 #include "cmemory.h"
35 #include "ustr_imp.h"
36
37 /* Prototypes --------------------------------------------------------------- */
38
39 /* Keep these here to make finicky compilers happy */
40
41 U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args,
42 UErrorCode *err);
43 U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args,
44 UErrorCode *err);
45
46
47 /* UTF-8 -------------------------------------------------------------------- */
48
/* Largest value that still fits into a single 16-bit code unit. */
#define MAXIMUM_UCS2            0x0000FFFF

/*
 * offsetsFromUTF8[n] is subtracted from the accumulator of an n-byte sequence.
 * The decoders fold raw bytes together as acc = (acc << 6) + byte, so the
 * lead-byte marker and the 0x80 marker of every trail byte end up in the sum;
 * each entry below is exactly that surplus.
 */
static const uint32_t offsetsFromUTF8[5] = {
    0,                      /* [0] not used for any sequence length */
    (uint32_t) 0x00000000,  /* [1] single byte: nothing to strip */
    (uint32_t) 0x00003080,  /* [2] (0xC0 << 6) + 0x80 */
    (uint32_t) 0x000E2080,  /* [3] (0xE0 << 12) + (0x80 << 6) + 0x80 */
    (uint32_t) 0x03C82080   /* [4] (0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
};
55
hasCESU8Data(const UConverter * cnv)56 static UBool hasCESU8Data(const UConverter *cnv)
57 {
58 #if UCONFIG_ONLY_HTML_CONVERSION
59 return FALSE;
60 #else
61 return (UBool)(cnv->sharedData == &_CESU8Data);
62 #endif
63 }
64 U_CDECL_BEGIN
ucnv_toUnicode_UTF8(UConverterToUnicodeArgs * args,UErrorCode * err)65 static void U_CALLCONV ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
66 UErrorCode * err)
67 {
68 UConverter *cnv = args->converter;
69 const unsigned char *mySource = (unsigned char *) args->source;
70 UChar *myTarget = args->target;
71 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
72 const UChar *targetLimit = args->targetLimit;
73 unsigned char *toUBytes = cnv->toUBytes;
74 UBool isCESU8 = hasCESU8Data(cnv);
75 uint32_t ch, ch2 = 0;
76 int32_t i, inBytes;
77
78 /* Restore size of current sequence */
79 if (cnv->toULength > 0 && myTarget < targetLimit)
80 {
81 inBytes = cnv->mode; /* restore # of bytes to consume */
82 i = cnv->toULength; /* restore # of bytes consumed */
83 cnv->toULength = 0;
84
85 ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
86 cnv->toUnicodeStatus = 0;
87 goto morebytes;
88 }
89
90
91 while (mySource < sourceLimit && myTarget < targetLimit)
92 {
93 ch = *(mySource++);
94 if (U8_IS_SINGLE(ch)) /* Simple case */
95 {
96 *(myTarget++) = (UChar) ch;
97 }
98 else
99 {
100 /* store the first char */
101 toUBytes[0] = (char)ch;
102 inBytes = U8_COUNT_BYTES_NON_ASCII(ch); /* lookup current sequence length */
103 i = 1;
104
105 morebytes:
106 while (i < inBytes)
107 {
108 if (mySource < sourceLimit)
109 {
110 toUBytes[i] = (char) (ch2 = *mySource);
111 if (!icu::UTF8::isValidTrail(ch, static_cast<uint8_t>(ch2), i, inBytes) &&
112 !(isCESU8 && i == 1 && ch == 0xed && U8_IS_TRAIL(ch2)))
113 {
114 break; /* i < inBytes */
115 }
116 ch = (ch << 6) + ch2;
117 ++mySource;
118 i++;
119 }
120 else
121 {
122 /* stores a partially calculated target*/
123 cnv->toUnicodeStatus = ch;
124 cnv->mode = inBytes;
125 cnv->toULength = (int8_t) i;
126 goto donefornow;
127 }
128 }
129
130 // In CESU-8, only surrogates, not supplementary code points, are encoded directly.
131 if (i == inBytes && (!isCESU8 || i <= 3))
132 {
133 /* Remove the accumulated high bits */
134 ch -= offsetsFromUTF8[inBytes];
135
136 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
137 if (ch <= MAXIMUM_UCS2)
138 {
139 /* fits in 16 bits */
140 *(myTarget++) = (UChar) ch;
141 }
142 else
143 {
144 /* write out the surrogates */
145 *(myTarget++) = U16_LEAD(ch);
146 ch = U16_TRAIL(ch);
147 if (myTarget < targetLimit)
148 {
149 *(myTarget++) = (UChar)ch;
150 }
151 else
152 {
153 /* Put in overflow buffer (not handled here) */
154 cnv->UCharErrorBuffer[0] = (UChar) ch;
155 cnv->UCharErrorBufferLength = 1;
156 *err = U_BUFFER_OVERFLOW_ERROR;
157 break;
158 }
159 }
160 }
161 else
162 {
163 cnv->toULength = (int8_t)i;
164 *err = U_ILLEGAL_CHAR_FOUND;
165 break;
166 }
167 }
168 }
169
170 donefornow:
171 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
172 {
173 /* End of target buffer */
174 *err = U_BUFFER_OVERFLOW_ERROR;
175 }
176
177 args->target = myTarget;
178 args->source = (const char *) mySource;
179 }
180
ucnv_toUnicode_UTF8_OFFSETS_LOGIC(UConverterToUnicodeArgs * args,UErrorCode * err)181 static void U_CALLCONV ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args,
182 UErrorCode * err)
183 {
184 UConverter *cnv = args->converter;
185 const unsigned char *mySource = (unsigned char *) args->source;
186 UChar *myTarget = args->target;
187 int32_t *myOffsets = args->offsets;
188 int32_t offsetNum = 0;
189 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
190 const UChar *targetLimit = args->targetLimit;
191 unsigned char *toUBytes = cnv->toUBytes;
192 UBool isCESU8 = hasCESU8Data(cnv);
193 uint32_t ch, ch2 = 0;
194 int32_t i, inBytes;
195
196 /* Restore size of current sequence */
197 if (cnv->toULength > 0 && myTarget < targetLimit)
198 {
199 inBytes = cnv->mode; /* restore # of bytes to consume */
200 i = cnv->toULength; /* restore # of bytes consumed */
201 cnv->toULength = 0;
202
203 ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
204 cnv->toUnicodeStatus = 0;
205 goto morebytes;
206 }
207
208 while (mySource < sourceLimit && myTarget < targetLimit)
209 {
210 ch = *(mySource++);
211 if (U8_IS_SINGLE(ch)) /* Simple case */
212 {
213 *(myTarget++) = (UChar) ch;
214 *(myOffsets++) = offsetNum++;
215 }
216 else
217 {
218 toUBytes[0] = (char)ch;
219 inBytes = U8_COUNT_BYTES_NON_ASCII(ch);
220 i = 1;
221
222 morebytes:
223 while (i < inBytes)
224 {
225 if (mySource < sourceLimit)
226 {
227 toUBytes[i] = (char) (ch2 = *mySource);
228 if (!icu::UTF8::isValidTrail(ch, static_cast<uint8_t>(ch2), i, inBytes) &&
229 !(isCESU8 && i == 1 && ch == 0xed && U8_IS_TRAIL(ch2)))
230 {
231 break; /* i < inBytes */
232 }
233 ch = (ch << 6) + ch2;
234 ++mySource;
235 i++;
236 }
237 else
238 {
239 cnv->toUnicodeStatus = ch;
240 cnv->mode = inBytes;
241 cnv->toULength = (int8_t)i;
242 goto donefornow;
243 }
244 }
245
246 // In CESU-8, only surrogates, not supplementary code points, are encoded directly.
247 if (i == inBytes && (!isCESU8 || i <= 3))
248 {
249 /* Remove the accumulated high bits */
250 ch -= offsetsFromUTF8[inBytes];
251
252 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
253 if (ch <= MAXIMUM_UCS2)
254 {
255 /* fits in 16 bits */
256 *(myTarget++) = (UChar) ch;
257 *(myOffsets++) = offsetNum;
258 }
259 else
260 {
261 /* write out the surrogates */
262 *(myTarget++) = U16_LEAD(ch);
263 *(myOffsets++) = offsetNum;
264 ch = U16_TRAIL(ch);
265 if (myTarget < targetLimit)
266 {
267 *(myTarget++) = (UChar)ch;
268 *(myOffsets++) = offsetNum;
269 }
270 else
271 {
272 cnv->UCharErrorBuffer[0] = (UChar) ch;
273 cnv->UCharErrorBufferLength = 1;
274 *err = U_BUFFER_OVERFLOW_ERROR;
275 }
276 }
277 offsetNum += i;
278 }
279 else
280 {
281 cnv->toULength = (int8_t)i;
282 *err = U_ILLEGAL_CHAR_FOUND;
283 break;
284 }
285 }
286 }
287
288 donefornow:
289 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
290 { /* End of target buffer */
291 *err = U_BUFFER_OVERFLOW_ERROR;
292 }
293
294 args->target = myTarget;
295 args->source = (const char *) mySource;
296 args->offsets = myOffsets;
297 }
298 U_CDECL_END
299
ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,UErrorCode * err)300 U_CFUNC void U_CALLCONV ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
301 UErrorCode * err)
302 {
303 UConverter *cnv = args->converter;
304 const UChar *mySource = args->source;
305 const UChar *sourceLimit = args->sourceLimit;
306 uint8_t *myTarget = (uint8_t *) args->target;
307 const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
308 uint8_t *tempPtr;
309 UChar32 ch;
310 uint8_t tempBuf[4];
311 int32_t indexToWrite;
312 UBool isNotCESU8 = !hasCESU8Data(cnv);
313
314 if (cnv->fromUChar32 && myTarget < targetLimit)
315 {
316 ch = cnv->fromUChar32;
317 cnv->fromUChar32 = 0;
318 goto lowsurrogate;
319 }
320
321 while (mySource < sourceLimit && myTarget < targetLimit)
322 {
323 ch = *(mySource++);
324
325 if (ch < 0x80) /* Single byte */
326 {
327 *(myTarget++) = (uint8_t) ch;
328 }
329 else if (ch < 0x800) /* Double byte */
330 {
331 *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
332 if (myTarget < targetLimit)
333 {
334 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
335 }
336 else
337 {
338 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
339 cnv->charErrorBufferLength = 1;
340 *err = U_BUFFER_OVERFLOW_ERROR;
341 }
342 }
343 else {
344 /* Check for surrogates */
345 if(U16_IS_SURROGATE(ch) && isNotCESU8) {
346 lowsurrogate:
347 if (mySource < sourceLimit) {
348 /* test both code units */
349 if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
350 /* convert and consume this supplementary code point */
351 ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
352 ++mySource;
353 /* exit this condition tree */
354 }
355 else {
356 /* this is an unpaired trail or lead code unit */
357 /* callback(illegal) */
358 cnv->fromUChar32 = ch;
359 *err = U_ILLEGAL_CHAR_FOUND;
360 break;
361 }
362 }
363 else {
364 /* no more input */
365 cnv->fromUChar32 = ch;
366 break;
367 }
368 }
369
370 /* Do we write the buffer directly for speed,
371 or do we have to be careful about target buffer space? */
372 tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
373
374 if (ch <= MAXIMUM_UCS2) {
375 indexToWrite = 2;
376 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
377 }
378 else {
379 indexToWrite = 3;
380 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
381 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
382 }
383 tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
384 tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
385
386 if (tempPtr == myTarget) {
387 /* There was enough space to write the codepoint directly. */
388 myTarget += (indexToWrite + 1);
389 }
390 else {
391 /* We might run out of room soon. Write it slowly. */
392 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
393 if (myTarget < targetLimit) {
394 *(myTarget++) = *tempPtr;
395 }
396 else {
397 cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
398 *err = U_BUFFER_OVERFLOW_ERROR;
399 }
400 }
401 }
402 }
403 }
404
405 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
406 {
407 *err = U_BUFFER_OVERFLOW_ERROR;
408 }
409
410 args->target = (char *) myTarget;
411 args->source = mySource;
412 }
413
ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,UErrorCode * err)414 U_CFUNC void U_CALLCONV ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
415 UErrorCode * err)
416 {
417 UConverter *cnv = args->converter;
418 const UChar *mySource = args->source;
419 int32_t *myOffsets = args->offsets;
420 const UChar *sourceLimit = args->sourceLimit;
421 uint8_t *myTarget = (uint8_t *) args->target;
422 const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
423 uint8_t *tempPtr;
424 UChar32 ch;
425 int32_t offsetNum, nextSourceIndex;
426 int32_t indexToWrite;
427 uint8_t tempBuf[4];
428 UBool isNotCESU8 = !hasCESU8Data(cnv);
429
430 if (cnv->fromUChar32 && myTarget < targetLimit)
431 {
432 ch = cnv->fromUChar32;
433 cnv->fromUChar32 = 0;
434 offsetNum = -1;
435 nextSourceIndex = 0;
436 goto lowsurrogate;
437 } else {
438 offsetNum = 0;
439 }
440
441 while (mySource < sourceLimit && myTarget < targetLimit)
442 {
443 ch = *(mySource++);
444
445 if (ch < 0x80) /* Single byte */
446 {
447 *(myOffsets++) = offsetNum++;
448 *(myTarget++) = (char) ch;
449 }
450 else if (ch < 0x800) /* Double byte */
451 {
452 *(myOffsets++) = offsetNum;
453 *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
454 if (myTarget < targetLimit)
455 {
456 *(myOffsets++) = offsetNum++;
457 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
458 }
459 else
460 {
461 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
462 cnv->charErrorBufferLength = 1;
463 *err = U_BUFFER_OVERFLOW_ERROR;
464 }
465 }
466 else
467 /* Check for surrogates */
468 {
469 nextSourceIndex = offsetNum + 1;
470
471 if(U16_IS_SURROGATE(ch) && isNotCESU8) {
472 lowsurrogate:
473 if (mySource < sourceLimit) {
474 /* test both code units */
475 if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
476 /* convert and consume this supplementary code point */
477 ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
478 ++mySource;
479 ++nextSourceIndex;
480 /* exit this condition tree */
481 }
482 else {
483 /* this is an unpaired trail or lead code unit */
484 /* callback(illegal) */
485 cnv->fromUChar32 = ch;
486 *err = U_ILLEGAL_CHAR_FOUND;
487 break;
488 }
489 }
490 else {
491 /* no more input */
492 cnv->fromUChar32 = ch;
493 break;
494 }
495 }
496
497 /* Do we write the buffer directly for speed,
498 or do we have to be careful about target buffer space? */
499 tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
500
501 if (ch <= MAXIMUM_UCS2) {
502 indexToWrite = 2;
503 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
504 }
505 else {
506 indexToWrite = 3;
507 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
508 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
509 }
510 tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
511 tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
512
513 if (tempPtr == myTarget) {
514 /* There was enough space to write the codepoint directly. */
515 myTarget += (indexToWrite + 1);
516 myOffsets[0] = offsetNum;
517 myOffsets[1] = offsetNum;
518 myOffsets[2] = offsetNum;
519 if (indexToWrite >= 3) {
520 myOffsets[3] = offsetNum;
521 }
522 myOffsets += (indexToWrite + 1);
523 }
524 else {
525 /* We might run out of room soon. Write it slowly. */
526 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
527 if (myTarget < targetLimit)
528 {
529 *(myOffsets++) = offsetNum;
530 *(myTarget++) = *tempPtr;
531 }
532 else
533 {
534 cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
535 *err = U_BUFFER_OVERFLOW_ERROR;
536 }
537 }
538 }
539 offsetNum = nextSourceIndex;
540 }
541 }
542
543 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
544 {
545 *err = U_BUFFER_OVERFLOW_ERROR;
546 }
547
548 args->target = (char *) myTarget;
549 args->source = mySource;
550 args->offsets = myOffsets;
551 }
552
553 U_CDECL_BEGIN
ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs * args,UErrorCode * err)554 static UChar32 U_CALLCONV ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
555 UErrorCode *err) {
556 UConverter *cnv;
557 const uint8_t *sourceInitial;
558 const uint8_t *source;
559 uint8_t myByte;
560 UChar32 ch;
561 int8_t i;
562
563 /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */
564
565 cnv = args->converter;
566 sourceInitial = source = (const uint8_t *)args->source;
567 if (source >= (const uint8_t *)args->sourceLimit)
568 {
569 /* no input */
570 *err = U_INDEX_OUTOFBOUNDS_ERROR;
571 return 0xffff;
572 }
573
574 myByte = (uint8_t)*(source++);
575 if (U8_IS_SINGLE(myByte))
576 {
577 args->source = (const char *)source;
578 return (UChar32)myByte;
579 }
580
581 uint16_t countTrailBytes = U8_COUNT_TRAIL_BYTES(myByte);
582 if (countTrailBytes == 0) {
583 cnv->toUBytes[0] = myByte;
584 cnv->toULength = 1;
585 *err = U_ILLEGAL_CHAR_FOUND;
586 args->source = (const char *)source;
587 return 0xffff;
588 }
589
590 /*The byte sequence is longer than the buffer area passed*/
591 if (((const char *)source + countTrailBytes) > args->sourceLimit)
592 {
593 /* check if all of the remaining bytes are trail bytes */
594 uint16_t extraBytesToWrite = countTrailBytes + 1;
595 cnv->toUBytes[0] = myByte;
596 i = 1;
597 *err = U_TRUNCATED_CHAR_FOUND;
598 while(source < (const uint8_t *)args->sourceLimit) {
599 uint8_t b = *source;
600 if(icu::UTF8::isValidTrail(myByte, b, i, extraBytesToWrite)) {
601 cnv->toUBytes[i++] = b;
602 ++source;
603 } else {
604 /* error even before we run out of input */
605 *err = U_ILLEGAL_CHAR_FOUND;
606 break;
607 }
608 }
609 cnv->toULength = i;
610 args->source = (const char *)source;
611 return 0xffff;
612 }
613
614 ch = myByte << 6;
615 if(countTrailBytes == 2) {
616 uint8_t t1 = *source, t2;
617 if(U8_IS_VALID_LEAD3_AND_T1(myByte, t1) && U8_IS_TRAIL(t2 = *++source)) {
618 args->source = (const char *)(source + 1);
619 return (((ch + t1) << 6) + t2) - offsetsFromUTF8[3];
620 }
621 } else if(countTrailBytes == 1) {
622 uint8_t t1 = *source;
623 if(U8_IS_TRAIL(t1)) {
624 args->source = (const char *)(source + 1);
625 return (ch + t1) - offsetsFromUTF8[2];
626 }
627 } else { // countTrailBytes == 3
628 uint8_t t1 = *source, t2, t3;
629 if(U8_IS_VALID_LEAD4_AND_T1(myByte, t1) && U8_IS_TRAIL(t2 = *++source) &&
630 U8_IS_TRAIL(t3 = *++source)) {
631 args->source = (const char *)(source + 1);
632 return (((((ch + t1) << 6) + t2) << 6) + t3) - offsetsFromUTF8[4];
633 }
634 }
635 args->source = (const char *)source;
636
637 for(i = 0; sourceInitial < source; ++i) {
638 cnv->toUBytes[i] = *sourceInitial++;
639 }
640 cnv->toULength = i;
641 *err = U_ILLEGAL_CHAR_FOUND;
642 return 0xffff;
643 }
644 U_CDECL_END
645
646 /* UTF-8-from-UTF-8 conversion functions ------------------------------------ */
647
648 U_CDECL_BEGIN
649 /* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */
650 static void U_CALLCONV
ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs * pFromUArgs,UConverterToUnicodeArgs * pToUArgs,UErrorCode * pErrorCode)651 ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
652 UConverterToUnicodeArgs *pToUArgs,
653 UErrorCode *pErrorCode) {
654 UConverter *utf8;
655 const uint8_t *source, *sourceLimit;
656 uint8_t *target;
657 int32_t targetCapacity;
658 int32_t count;
659
660 int8_t oldToULength, toULength, toULimit;
661
662 UChar32 c;
663 uint8_t b, t1, t2;
664
665 /* set up the local pointers */
666 utf8=pToUArgs->converter;
667 source=(uint8_t *)pToUArgs->source;
668 sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
669 target=(uint8_t *)pFromUArgs->target;
670 targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
671
672 /* get the converter state from the UTF-8 UConverter */
673 if(utf8->toULength > 0) {
674 toULength=oldToULength=utf8->toULength;
675 toULimit=(int8_t)utf8->mode;
676 c=(UChar32)utf8->toUnicodeStatus;
677 } else {
678 toULength=oldToULength=toULimit=0;
679 c = 0;
680 }
681
682 count=(int32_t)(sourceLimit-source)+oldToULength;
683 if(count<toULimit) {
684 /*
685 * Not enough input to complete the partial character.
686 * Jump to moreBytes below - it will not output to target.
687 */
688 } else if(targetCapacity<toULimit) {
689 /*
690 * Not enough target capacity to output the partial character.
691 * Let the standard converter handle this.
692 */
693 *pErrorCode=U_USING_DEFAULT_WARNING;
694 return;
695 } else {
696 // Use a single counter for source and target, counting the minimum of
697 // the source length and the target capacity.
698 // Let the standard converter handle edge cases.
699 if(count>targetCapacity) {
700 count=targetCapacity;
701 }
702
703 // The conversion loop checks count>0 only once per character.
704 // If the buffer ends with a truncated sequence,
705 // then we reduce the count to stop before that,
706 // and collect the remaining bytes after the conversion loop.
707
708 // Do not go back into the bytes that will be read for finishing a partial
709 // sequence from the previous buffer.
710 int32_t length=count-toULimit;
711 U8_TRUNCATE_IF_INCOMPLETE(source, 0, length);
712 count=toULimit+length;
713 }
714
715 if(c!=0) {
716 utf8->toUnicodeStatus=0;
717 utf8->toULength=0;
718 goto moreBytes;
719 /* See note in ucnv_SBCSFromUTF8() about this goto. */
720 }
721
722 /* conversion loop */
723 while(count>0) {
724 b=*source++;
725 if(U8_IS_SINGLE(b)) {
726 /* convert ASCII */
727 *target++=b;
728 --count;
729 continue;
730 } else {
731 if(b>=0xe0) {
732 if( /* handle U+0800..U+FFFF inline */
733 b<0xf0 &&
734 U8_IS_VALID_LEAD3_AND_T1(b, t1=source[0]) &&
735 U8_IS_TRAIL(t2=source[1])
736 ) {
737 source+=2;
738 *target++=b;
739 *target++=t1;
740 *target++=t2;
741 count-=3;
742 continue;
743 }
744 } else {
745 if( /* handle U+0080..U+07FF inline */
746 b>=0xc2 &&
747 U8_IS_TRAIL(t1=*source)
748 ) {
749 ++source;
750 *target++=b;
751 *target++=t1;
752 count-=2;
753 continue;
754 }
755 }
756
757 /* handle "complicated" and error cases, and continuing partial characters */
758 oldToULength=0;
759 toULength=1;
760 toULimit=U8_COUNT_BYTES_NON_ASCII(b);
761 c=b;
762 moreBytes:
763 while(toULength<toULimit) {
764 if(source<sourceLimit) {
765 b=*source;
766 if(icu::UTF8::isValidTrail(c, b, toULength, toULimit)) {
767 ++source;
768 ++toULength;
769 c=(c<<6)+b;
770 } else {
771 break; /* sequence too short, stop with toULength<toULimit */
772 }
773 } else {
774 /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
775 source-=(toULength-oldToULength);
776 while(oldToULength<toULength) {
777 utf8->toUBytes[oldToULength++]=*source++;
778 }
779 utf8->toUnicodeStatus=c;
780 utf8->toULength=toULength;
781 utf8->mode=toULimit;
782 pToUArgs->source=(char *)source;
783 pFromUArgs->target=(char *)target;
784 return;
785 }
786 }
787
788 if(toULength!=toULimit) {
789 /* error handling: illegal UTF-8 byte sequence */
790 source-=(toULength-oldToULength);
791 while(oldToULength<toULength) {
792 utf8->toUBytes[oldToULength++]=*source++;
793 }
794 utf8->toULength=toULength;
795 pToUArgs->source=(char *)source;
796 pFromUArgs->target=(char *)target;
797 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
798 return;
799 }
800
801 /* copy the legal byte sequence to the target */
802 {
803 int8_t i;
804
805 for(i=0; i<oldToULength; ++i) {
806 *target++=utf8->toUBytes[i];
807 }
808 source-=(toULength-oldToULength);
809 for(; i<toULength; ++i) {
810 *target++=*source++;
811 }
812 count-=toULength;
813 }
814 }
815 }
816 U_ASSERT(count>=0);
817
818 if(U_SUCCESS(*pErrorCode) && source<sourceLimit) {
819 if(target==(const uint8_t *)pFromUArgs->targetLimit) {
820 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
821 } else {
822 b=*source;
823 toULimit=U8_COUNT_BYTES(b);
824 if(toULimit>(sourceLimit-source)) {
825 /* collect a truncated byte sequence */
826 toULength=0;
827 c=b;
828 for(;;) {
829 utf8->toUBytes[toULength++]=b;
830 if(++source==sourceLimit) {
831 /* partial byte sequence at end of source */
832 utf8->toUnicodeStatus=c;
833 utf8->toULength=toULength;
834 utf8->mode=toULimit;
835 break;
836 } else if(!icu::UTF8::isValidTrail(c, b=*source, toULength, toULimit)) {
837 utf8->toULength=toULength;
838 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
839 break;
840 }
841 c=(c<<6)+b;
842 }
843 } else {
844 /* partial-sequence target overflow: fall back to the pivoting implementation */
845 *pErrorCode=U_USING_DEFAULT_WARNING;
846 }
847 }
848 }
849
850 /* write back the updated pointers */
851 pToUArgs->source=(char *)source;
852 pFromUArgs->target=(char *)target;
853 }
854
855 U_CDECL_END
856
857 /* UTF-8 converter data ----------------------------------------------------- */
858
859 static const UConverterImpl _UTF8Impl={
860 UCNV_UTF8,
861
862 NULL,
863 NULL,
864
865 NULL,
866 NULL,
867 NULL,
868
869 ucnv_toUnicode_UTF8,
870 ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
871 ucnv_fromUnicode_UTF8,
872 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
873 ucnv_getNextUChar_UTF8,
874
875 NULL,
876 NULL,
877 NULL,
878 NULL,
879 ucnv_getNonSurrogateUnicodeSet,
880
881 ucnv_UTF8FromUTF8,
882 ucnv_UTF8FromUTF8
883 };
884
/* CCSID 1208 refers to UTF-8 for any version of Unicode. */
886 static const UConverterStaticData _UTF8StaticData={
887 sizeof(UConverterStaticData),
888 "UTF-8",
889 1208, UCNV_IBM, UCNV_UTF8,
890 1, 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
891 { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
892 0,
893 0,
894 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
895 };
896
897
898 const UConverterSharedData _UTF8Data=
899 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF8StaticData, &_UTF8Impl);
900
901 /* CESU-8 converter data ---------------------------------------------------- */
902
903 static const UConverterImpl _CESU8Impl={
904 UCNV_CESU8,
905
906 NULL,
907 NULL,
908
909 NULL,
910 NULL,
911 NULL,
912
913 ucnv_toUnicode_UTF8,
914 ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
915 ucnv_fromUnicode_UTF8,
916 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
917 NULL,
918
919 NULL,
920 NULL,
921 NULL,
922 NULL,
923 ucnv_getCompleteUnicodeSet,
924
925 NULL,
926 NULL
927 };
928
929 static const UConverterStaticData _CESU8StaticData={
930 sizeof(UConverterStaticData),
931 "CESU-8",
932 9400, /* CCSID for CESU-8 */
933 UCNV_UNKNOWN, UCNV_CESU8, 1, 3,
934 { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
935 0,
936 0,
937 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
938 };
939
940
941 const UConverterSharedData _CESU8Data=
942 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_CESU8StaticData, &_CESU8Impl);
943
944 #endif
945