• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 *   Copyright (C) 2002-2016, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 **********************************************************************
8 *   file name:  ucnv_u8.c
9 *   encoding:   UTF-8
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2002jul01
14 *   created by: Markus W. Scherer
15 *
16 *   UTF-8 converter implementation. Used to be in ucnv_utf.c.
17 *
18 *   Also, CESU-8 implementation, see UTR 26.
19 *   The CESU-8 converter uses all the same functions as the
20 *   UTF-8 converter, with a branch for converting supplementary code points.
21 */
22 
23 #include "unicode/utypes.h"
24 
25 #if !UCONFIG_NO_CONVERSION
26 
27 #include "unicode/ucnv.h"
28 #include "unicode/utf.h"
29 #include "unicode/utf8.h"
30 #include "unicode/utf16.h"
31 #include "uassert.h"
32 #include "ucnv_bld.h"
33 #include "ucnv_cnv.h"
34 #include "cmemory.h"
35 #include "ustr_imp.h"
36 
37 /* Prototypes --------------------------------------------------------------- */
38 
39 /* Keep these here to make finicky compilers happy */
40 
41 U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args,
42                                            UErrorCode *err);
43 U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args,
44                                                         UErrorCode *err);
45 
46 
47 /* UTF-8 -------------------------------------------------------------------- */
48 
49 #define MAXIMUM_UCS2            0x0000FFFF
50 
/*
 * Constants subtracted from the raw accumulation ch = (ch << 6) + nextByte
 * after a whole sequence has been read, to strip the lead/trail marker bits.
 * Indexed by the total number of bytes in the sequence (0..4).
 */
static const uint32_t offsetsFromUTF8[5] = {
    0,              /* unused: length 0 */
    0x00000000u,    /* length 1 */
    0x00003080u,    /* length 2 */
    0x000E2080u,    /* length 3 */
    0x03C82080u     /* length 4 */
};
55 
hasCESU8Data(const UConverter * cnv)56 static UBool hasCESU8Data(const UConverter *cnv)
57 {
58 #if UCONFIG_ONLY_HTML_CONVERSION
59     return FALSE;
60 #else
61     return (UBool)(cnv->sharedData == &_CESU8Data);
62 #endif
63 }
64 U_CDECL_BEGIN
ucnv_toUnicode_UTF8(UConverterToUnicodeArgs * args,UErrorCode * err)65 static void  U_CALLCONV ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
66                                   UErrorCode * err)
67 {
68     UConverter *cnv = args->converter;
69     const unsigned char *mySource = (unsigned char *) args->source;
70     UChar *myTarget = args->target;
71     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
72     const UChar *targetLimit = args->targetLimit;
73     unsigned char *toUBytes = cnv->toUBytes;
74     UBool isCESU8 = hasCESU8Data(cnv);
75     uint32_t ch, ch2 = 0;
76     int32_t i, inBytes;
77 
78     /* Restore size of current sequence */
79     if (cnv->toUnicodeStatus && myTarget < targetLimit)
80     {
81         inBytes = cnv->mode;            /* restore # of bytes to consume */
82         i = cnv->toULength;             /* restore # of bytes consumed */
83         cnv->toULength = 0;
84 
85         ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
86         cnv->toUnicodeStatus = 0;
87         goto morebytes;
88     }
89 
90 
91     while (mySource < sourceLimit && myTarget < targetLimit)
92     {
93         ch = *(mySource++);
94         if (U8_IS_SINGLE(ch))        /* Simple case */
95         {
96             *(myTarget++) = (UChar) ch;
97         }
98         else
99         {
100             /* store the first char */
101             toUBytes[0] = (char)ch;
102             inBytes = U8_COUNT_BYTES_NON_ASCII(ch); /* lookup current sequence length */
103             i = 1;
104 
105 morebytes:
106             while (i < inBytes)
107             {
108                 if (mySource < sourceLimit)
109                 {
110                     toUBytes[i] = (char) (ch2 = *mySource);
111                     if (!icu::UTF8::isValidTrail(ch, ch2, i, inBytes) &&
112                             !(isCESU8 && i == 1 && ch == 0xed && U8_IS_TRAIL(ch2)))
113                     {
114                         break; /* i < inBytes */
115                     }
116                     ch = (ch << 6) + ch2;
117                     ++mySource;
118                     i++;
119                 }
120                 else
121                 {
122                     /* stores a partially calculated target*/
123                     cnv->toUnicodeStatus = ch;
124                     cnv->mode = inBytes;
125                     cnv->toULength = (int8_t) i;
126                     goto donefornow;
127                 }
128             }
129 
130             // In CESU-8, only surrogates, not supplementary code points, are encoded directly.
131             if (i == inBytes && (!isCESU8 || i <= 3))
132             {
133                 /* Remove the accumulated high bits */
134                 ch -= offsetsFromUTF8[inBytes];
135 
136                 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
137                 if (ch <= MAXIMUM_UCS2)
138                 {
139                     /* fits in 16 bits */
140                     *(myTarget++) = (UChar) ch;
141                 }
142                 else
143                 {
144                     /* write out the surrogates */
145                     *(myTarget++) = U16_LEAD(ch);
146                     ch = U16_TRAIL(ch);
147                     if (myTarget < targetLimit)
148                     {
149                         *(myTarget++) = (UChar)ch;
150                     }
151                     else
152                     {
153                         /* Put in overflow buffer (not handled here) */
154                         cnv->UCharErrorBuffer[0] = (UChar) ch;
155                         cnv->UCharErrorBufferLength = 1;
156                         *err = U_BUFFER_OVERFLOW_ERROR;
157                         break;
158                     }
159                 }
160             }
161             else
162             {
163                 cnv->toULength = (int8_t)i;
164                 *err = U_ILLEGAL_CHAR_FOUND;
165                 break;
166             }
167         }
168     }
169 
170 donefornow:
171     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
172     {
173         /* End of target buffer */
174         *err = U_BUFFER_OVERFLOW_ERROR;
175     }
176 
177     args->target = myTarget;
178     args->source = (const char *) mySource;
179 }
180 
ucnv_toUnicode_UTF8_OFFSETS_LOGIC(UConverterToUnicodeArgs * args,UErrorCode * err)181 static void  U_CALLCONV ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args,
182                                                 UErrorCode * err)
183 {
184     UConverter *cnv = args->converter;
185     const unsigned char *mySource = (unsigned char *) args->source;
186     UChar *myTarget = args->target;
187     int32_t *myOffsets = args->offsets;
188     int32_t offsetNum = 0;
189     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
190     const UChar *targetLimit = args->targetLimit;
191     unsigned char *toUBytes = cnv->toUBytes;
192     UBool isCESU8 = hasCESU8Data(cnv);
193     uint32_t ch, ch2 = 0;
194     int32_t i, inBytes;
195 
196     /* Restore size of current sequence */
197     if (cnv->toUnicodeStatus && myTarget < targetLimit)
198     {
199         inBytes = cnv->mode;            /* restore # of bytes to consume */
200         i = cnv->toULength;             /* restore # of bytes consumed */
201         cnv->toULength = 0;
202 
203         ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
204         cnv->toUnicodeStatus = 0;
205         goto morebytes;
206     }
207 
208     while (mySource < sourceLimit && myTarget < targetLimit)
209     {
210         ch = *(mySource++);
211         if (U8_IS_SINGLE(ch))        /* Simple case */
212         {
213             *(myTarget++) = (UChar) ch;
214             *(myOffsets++) = offsetNum++;
215         }
216         else
217         {
218             toUBytes[0] = (char)ch;
219             inBytes = U8_COUNT_BYTES_NON_ASCII(ch);
220             i = 1;
221 
222 morebytes:
223             while (i < inBytes)
224             {
225                 if (mySource < sourceLimit)
226                 {
227                     toUBytes[i] = (char) (ch2 = *mySource);
228                     if (!icu::UTF8::isValidTrail(ch, ch2, i, inBytes) &&
229                             !(isCESU8 && i == 1 && ch == 0xed && U8_IS_TRAIL(ch2)))
230                     {
231                         break; /* i < inBytes */
232                     }
233                     ch = (ch << 6) + ch2;
234                     ++mySource;
235                     i++;
236                 }
237                 else
238                 {
239                     cnv->toUnicodeStatus = ch;
240                     cnv->mode = inBytes;
241                     cnv->toULength = (int8_t)i;
242                     goto donefornow;
243                 }
244             }
245 
246             // In CESU-8, only surrogates, not supplementary code points, are encoded directly.
247             if (i == inBytes && (!isCESU8 || i <= 3))
248             {
249                 /* Remove the accumulated high bits */
250                 ch -= offsetsFromUTF8[inBytes];
251 
252                 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
253                 if (ch <= MAXIMUM_UCS2)
254                 {
255                     /* fits in 16 bits */
256                     *(myTarget++) = (UChar) ch;
257                     *(myOffsets++) = offsetNum;
258                 }
259                 else
260                 {
261                     /* write out the surrogates */
262                     *(myTarget++) = U16_LEAD(ch);
263                     *(myOffsets++) = offsetNum;
264                     ch = U16_TRAIL(ch);
265                     if (myTarget < targetLimit)
266                     {
267                         *(myTarget++) = (UChar)ch;
268                         *(myOffsets++) = offsetNum;
269                     }
270                     else
271                     {
272                         cnv->UCharErrorBuffer[0] = (UChar) ch;
273                         cnv->UCharErrorBufferLength = 1;
274                         *err = U_BUFFER_OVERFLOW_ERROR;
275                     }
276                 }
277                 offsetNum += i;
278             }
279             else
280             {
281                 cnv->toULength = (int8_t)i;
282                 *err = U_ILLEGAL_CHAR_FOUND;
283                 break;
284             }
285         }
286     }
287 
288 donefornow:
289     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
290     {   /* End of target buffer */
291         *err = U_BUFFER_OVERFLOW_ERROR;
292     }
293 
294     args->target = myTarget;
295     args->source = (const char *) mySource;
296     args->offsets = myOffsets;
297 }
298 U_CDECL_END
299 
ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,UErrorCode * err)300 U_CFUNC void  U_CALLCONV ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
301                                     UErrorCode * err)
302 {
303     UConverter *cnv = args->converter;
304     const UChar *mySource = args->source;
305     const UChar *sourceLimit = args->sourceLimit;
306     uint8_t *myTarget = (uint8_t *) args->target;
307     const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
308     uint8_t *tempPtr;
309     UChar32 ch;
310     uint8_t tempBuf[4];
311     int32_t indexToWrite;
312     UBool isNotCESU8 = !hasCESU8Data(cnv);
313 
314     if (cnv->fromUChar32 && myTarget < targetLimit)
315     {
316         ch = cnv->fromUChar32;
317         cnv->fromUChar32 = 0;
318         goto lowsurrogate;
319     }
320 
321     while (mySource < sourceLimit && myTarget < targetLimit)
322     {
323         ch = *(mySource++);
324 
325         if (ch < 0x80)        /* Single byte */
326         {
327             *(myTarget++) = (uint8_t) ch;
328         }
329         else if (ch < 0x800)  /* Double byte */
330         {
331             *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
332             if (myTarget < targetLimit)
333             {
334                 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
335             }
336             else
337             {
338                 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
339                 cnv->charErrorBufferLength = 1;
340                 *err = U_BUFFER_OVERFLOW_ERROR;
341             }
342         }
343         else {
344             /* Check for surrogates */
345             if(U16_IS_SURROGATE(ch) && isNotCESU8) {
346 lowsurrogate:
347                 if (mySource < sourceLimit) {
348                     /* test both code units */
349                     if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
350                         /* convert and consume this supplementary code point */
351                         ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
352                         ++mySource;
353                         /* exit this condition tree */
354                     }
355                     else {
356                         /* this is an unpaired trail or lead code unit */
357                         /* callback(illegal) */
358                         cnv->fromUChar32 = ch;
359                         *err = U_ILLEGAL_CHAR_FOUND;
360                         break;
361                     }
362                 }
363                 else {
364                     /* no more input */
365                     cnv->fromUChar32 = ch;
366                     break;
367                 }
368             }
369 
370             /* Do we write the buffer directly for speed,
371             or do we have to be careful about target buffer space? */
372             tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
373 
374             if (ch <= MAXIMUM_UCS2) {
375                 indexToWrite = 2;
376                 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
377             }
378             else {
379                 indexToWrite = 3;
380                 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
381                 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
382             }
383             tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
384             tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
385 
386             if (tempPtr == myTarget) {
387                 /* There was enough space to write the codepoint directly. */
388                 myTarget += (indexToWrite + 1);
389             }
390             else {
391                 /* We might run out of room soon. Write it slowly. */
392                 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
393                     if (myTarget < targetLimit) {
394                         *(myTarget++) = *tempPtr;
395                     }
396                     else {
397                         cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
398                         *err = U_BUFFER_OVERFLOW_ERROR;
399                     }
400                 }
401             }
402         }
403     }
404 
405     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
406     {
407         *err = U_BUFFER_OVERFLOW_ERROR;
408     }
409 
410     args->target = (char *) myTarget;
411     args->source = mySource;
412 }
413 
ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,UErrorCode * err)414 U_CFUNC void  U_CALLCONV ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
415                                                   UErrorCode * err)
416 {
417     UConverter *cnv = args->converter;
418     const UChar *mySource = args->source;
419     int32_t *myOffsets = args->offsets;
420     const UChar *sourceLimit = args->sourceLimit;
421     uint8_t *myTarget = (uint8_t *) args->target;
422     const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
423     uint8_t *tempPtr;
424     UChar32 ch;
425     int32_t offsetNum, nextSourceIndex;
426     int32_t indexToWrite;
427     uint8_t tempBuf[4];
428     UBool isNotCESU8 = !hasCESU8Data(cnv);
429 
430     if (cnv->fromUChar32 && myTarget < targetLimit)
431     {
432         ch = cnv->fromUChar32;
433         cnv->fromUChar32 = 0;
434         offsetNum = -1;
435         nextSourceIndex = 0;
436         goto lowsurrogate;
437     } else {
438         offsetNum = 0;
439     }
440 
441     while (mySource < sourceLimit && myTarget < targetLimit)
442     {
443         ch = *(mySource++);
444 
445         if (ch < 0x80)        /* Single byte */
446         {
447             *(myOffsets++) = offsetNum++;
448             *(myTarget++) = (char) ch;
449         }
450         else if (ch < 0x800)  /* Double byte */
451         {
452             *(myOffsets++) = offsetNum;
453             *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
454             if (myTarget < targetLimit)
455             {
456                 *(myOffsets++) = offsetNum++;
457                 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
458             }
459             else
460             {
461                 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
462                 cnv->charErrorBufferLength = 1;
463                 *err = U_BUFFER_OVERFLOW_ERROR;
464             }
465         }
466         else
467         /* Check for surrogates */
468         {
469             nextSourceIndex = offsetNum + 1;
470 
471             if(U16_IS_SURROGATE(ch) && isNotCESU8) {
472 lowsurrogate:
473                 if (mySource < sourceLimit) {
474                     /* test both code units */
475                     if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
476                         /* convert and consume this supplementary code point */
477                         ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
478                         ++mySource;
479                         ++nextSourceIndex;
480                         /* exit this condition tree */
481                     }
482                     else {
483                         /* this is an unpaired trail or lead code unit */
484                         /* callback(illegal) */
485                         cnv->fromUChar32 = ch;
486                         *err = U_ILLEGAL_CHAR_FOUND;
487                         break;
488                     }
489                 }
490                 else {
491                     /* no more input */
492                     cnv->fromUChar32 = ch;
493                     break;
494                 }
495             }
496 
497             /* Do we write the buffer directly for speed,
498             or do we have to be careful about target buffer space? */
499             tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
500 
501             if (ch <= MAXIMUM_UCS2) {
502                 indexToWrite = 2;
503                 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
504             }
505             else {
506                 indexToWrite = 3;
507                 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
508                 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
509             }
510             tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
511             tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
512 
513             if (tempPtr == myTarget) {
514                 /* There was enough space to write the codepoint directly. */
515                 myTarget += (indexToWrite + 1);
516                 myOffsets[0] = offsetNum;
517                 myOffsets[1] = offsetNum;
518                 myOffsets[2] = offsetNum;
519                 if (indexToWrite >= 3) {
520                     myOffsets[3] = offsetNum;
521                 }
522                 myOffsets += (indexToWrite + 1);
523             }
524             else {
525                 /* We might run out of room soon. Write it slowly. */
526                 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
527                     if (myTarget < targetLimit)
528                     {
529                         *(myOffsets++) = offsetNum;
530                         *(myTarget++) = *tempPtr;
531                     }
532                     else
533                     {
534                         cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
535                         *err = U_BUFFER_OVERFLOW_ERROR;
536                     }
537                 }
538             }
539             offsetNum = nextSourceIndex;
540         }
541     }
542 
543     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
544     {
545         *err = U_BUFFER_OVERFLOW_ERROR;
546     }
547 
548     args->target = (char *) myTarget;
549     args->source = mySource;
550     args->offsets = myOffsets;
551 }
552 
553 U_CDECL_BEGIN
ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs * args,UErrorCode * err)554 static UChar32 U_CALLCONV ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
555                                                UErrorCode *err) {
556     UConverter *cnv;
557     const uint8_t *sourceInitial;
558     const uint8_t *source;
559     uint8_t myByte;
560     UChar32 ch;
561     int8_t i;
562 
563     /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */
564 
565     cnv = args->converter;
566     sourceInitial = source = (const uint8_t *)args->source;
567     if (source >= (const uint8_t *)args->sourceLimit)
568     {
569         /* no input */
570         *err = U_INDEX_OUTOFBOUNDS_ERROR;
571         return 0xffff;
572     }
573 
574     myByte = (uint8_t)*(source++);
575     if (U8_IS_SINGLE(myByte))
576     {
577         args->source = (const char *)source;
578         return (UChar32)myByte;
579     }
580 
581     uint16_t countTrailBytes = U8_COUNT_TRAIL_BYTES(myByte);
582     if (countTrailBytes == 0) {
583         cnv->toUBytes[0] = myByte;
584         cnv->toULength = 1;
585         *err = U_ILLEGAL_CHAR_FOUND;
586         args->source = (const char *)source;
587         return 0xffff;
588     }
589 
590     /*The byte sequence is longer than the buffer area passed*/
591     if (((const char *)source + countTrailBytes) > args->sourceLimit)
592     {
593         /* check if all of the remaining bytes are trail bytes */
594         uint16_t extraBytesToWrite = countTrailBytes + 1;
595         cnv->toUBytes[0] = myByte;
596         i = 1;
597         *err = U_TRUNCATED_CHAR_FOUND;
598         while(source < (const uint8_t *)args->sourceLimit) {
599             uint8_t b = *source;
600             if(icu::UTF8::isValidTrail(myByte, b, i, extraBytesToWrite)) {
601                 cnv->toUBytes[i++] = b;
602                 ++source;
603             } else {
604                 /* error even before we run out of input */
605                 *err = U_ILLEGAL_CHAR_FOUND;
606                 break;
607             }
608         }
609         cnv->toULength = i;
610         args->source = (const char *)source;
611         return 0xffff;
612     }
613 
614     ch = myByte << 6;
615     if(countTrailBytes == 2) {
616         uint8_t t1 = *source, t2;
617         if(U8_IS_VALID_LEAD3_AND_T1(myByte, t1) && U8_IS_TRAIL(t2 = *++source)) {
618             args->source = (const char *)(source + 1);
619             return (((ch + t1) << 6) + t2) - offsetsFromUTF8[3];
620         }
621     } else if(countTrailBytes == 1) {
622         uint8_t t1 = *source;
623         if(U8_IS_TRAIL(t1)) {
624             args->source = (const char *)(source + 1);
625             return (ch + t1) - offsetsFromUTF8[2];
626         }
627     } else {  // countTrailBytes == 3
628         uint8_t t1 = *source, t2, t3;
629         if(U8_IS_VALID_LEAD4_AND_T1(myByte, t1) && U8_IS_TRAIL(t2 = *++source) &&
630                 U8_IS_TRAIL(t3 = *++source)) {
631             args->source = (const char *)(source + 1);
632             return (((((ch + t1) << 6) + t2) << 6) + t3) - offsetsFromUTF8[4];
633         }
634     }
635     args->source = (const char *)source;
636 
637     for(i = 0; sourceInitial < source; ++i) {
638         cnv->toUBytes[i] = *sourceInitial++;
639     }
640     cnv->toULength = i;
641     *err = U_ILLEGAL_CHAR_FOUND;
642     return 0xffff;
643 }
644 U_CDECL_END
645 
646 /* UTF-8-from-UTF-8 conversion functions ------------------------------------ */
647 
648 U_CDECL_BEGIN
649 /* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */
650 static void U_CALLCONV
ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs * pFromUArgs,UConverterToUnicodeArgs * pToUArgs,UErrorCode * pErrorCode)651 ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
652                   UConverterToUnicodeArgs *pToUArgs,
653                   UErrorCode *pErrorCode) {
654     UConverter *utf8;
655     const uint8_t *source, *sourceLimit;
656     uint8_t *target;
657     int32_t targetCapacity;
658     int32_t count;
659 
660     int8_t oldToULength, toULength, toULimit;
661 
662     UChar32 c;
663     uint8_t b, t1, t2;
664 
665     /* set up the local pointers */
666     utf8=pToUArgs->converter;
667     source=(uint8_t *)pToUArgs->source;
668     sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
669     target=(uint8_t *)pFromUArgs->target;
670     targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
671 
672     /* get the converter state from the UTF-8 UConverter */
673     c=(UChar32)utf8->toUnicodeStatus;
674     if(c!=0) {
675         toULength=oldToULength=utf8->toULength;
676         toULimit=(int8_t)utf8->mode;
677     } else {
678         toULength=oldToULength=toULimit=0;
679     }
680 
681     count=(int32_t)(sourceLimit-source)+oldToULength;
682     if(count<toULimit) {
683         /*
684          * Not enough input to complete the partial character.
685          * Jump to moreBytes below - it will not output to target.
686          */
687     } else if(targetCapacity<toULimit) {
688         /*
689          * Not enough target capacity to output the partial character.
690          * Let the standard converter handle this.
691          */
692         *pErrorCode=U_USING_DEFAULT_WARNING;
693         return;
694     } else {
695         // Use a single counter for source and target, counting the minimum of
696         // the source length and the target capacity.
697         // Let the standard converter handle edge cases.
698         const uint8_t *limit=sourceLimit;
699         if(count>targetCapacity) {
700             limit-=(count-targetCapacity);
701             count=targetCapacity;
702         }
703 
704         // The conversion loop checks count>0 only once per 1/2/3-byte character.
705         // If the buffer ends with a truncated 2- or 3-byte sequence,
706         // then we reduce the count to stop before that,
707         // and collect the remaining bytes after the conversion loop.
708         {
709             // Do not go back into the bytes that will be read for finishing a partial
710             // sequence from the previous buffer.
711             int32_t length=count-toULimit;
712             if(length>0) {
713                 uint8_t b1=*(limit-1);
714                 if(U8_IS_SINGLE(b1)) {
715                     // common ASCII character
716                 } else if(U8_IS_TRAIL(b1) && length>=2) {
717                     uint8_t b2=*(limit-2);
718                     if(0xe0<=b2 && b2<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
719                         // truncated 3-byte sequence
720                         count-=2;
721                     }
722                 } else if(0xc2<=b1 && b1<0xf0) {
723                     // truncated 2- or 3-byte sequence
724                     --count;
725                 }
726             }
727         }
728     }
729 
730     if(c!=0) {
731         utf8->toUnicodeStatus=0;
732         utf8->toULength=0;
733         goto moreBytes;
734         /* See note in ucnv_SBCSFromUTF8() about this goto. */
735     }
736 
737     /* conversion loop */
738     while(count>0) {
739         b=*source++;
740         if(U8_IS_SINGLE(b)) {
741             /* convert ASCII */
742             *target++=b;
743             --count;
744             continue;
745         } else {
746             if(b>=0xe0) {
747                 if( /* handle U+0800..U+FFFF inline */
748                     b<0xf0 &&
749                     U8_IS_VALID_LEAD3_AND_T1(b, t1=source[0]) &&
750                     U8_IS_TRAIL(t2=source[1])
751                 ) {
752                     source+=2;
753                     *target++=b;
754                     *target++=t1;
755                     *target++=t2;
756                     count-=3;
757                     continue;
758                 }
759             } else {
760                 if( /* handle U+0080..U+07FF inline */
761                     b>=0xc2 &&
762                     U8_IS_TRAIL(t1=*source)
763                 ) {
764                     ++source;
765                     *target++=b;
766                     *target++=t1;
767                     count-=2;
768                     continue;
769                 }
770             }
771 
772             /* handle "complicated" and error cases, and continuing partial characters */
773             oldToULength=0;
774             toULength=1;
775             toULimit=U8_COUNT_BYTES_NON_ASCII(b);
776             c=b;
777 moreBytes:
778             while(toULength<toULimit) {
779                 if(source<sourceLimit) {
780                     b=*source;
781                     if(icu::UTF8::isValidTrail(c, b, toULength, toULimit)) {
782                         ++source;
783                         ++toULength;
784                         c=(c<<6)+b;
785                     } else {
786                         break; /* sequence too short, stop with toULength<toULimit */
787                     }
788                 } else {
789                     /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
790                     source-=(toULength-oldToULength);
791                     while(oldToULength<toULength) {
792                         utf8->toUBytes[oldToULength++]=*source++;
793                     }
794                     utf8->toUnicodeStatus=c;
795                     utf8->toULength=toULength;
796                     utf8->mode=toULimit;
797                     pToUArgs->source=(char *)source;
798                     pFromUArgs->target=(char *)target;
799                     return;
800                 }
801             }
802 
803             if(toULength!=toULimit) {
804                 /* error handling: illegal UTF-8 byte sequence */
805                 source-=(toULength-oldToULength);
806                 while(oldToULength<toULength) {
807                     utf8->toUBytes[oldToULength++]=*source++;
808                 }
809                 utf8->toULength=toULength;
810                 pToUArgs->source=(char *)source;
811                 pFromUArgs->target=(char *)target;
812                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
813                 return;
814             }
815 
816             /* copy the legal byte sequence to the target */
817             if(count>=toULength) {
818                 int8_t i;
819 
820                 for(i=0; i<oldToULength; ++i) {
821                     *target++=utf8->toUBytes[i];
822                 }
823                 source-=(toULength-oldToULength);
824                 for(; i<toULength; ++i) {
825                     *target++=*source++;
826                 }
827                 count-=toULength;
828             } else {
829                 // A supplementary character that does not fit into the target.
830                 // Let the standard converter handle this.
831                 source-=(toULength-oldToULength);
832                 pToUArgs->source=(char *)source;
833                 pFromUArgs->target=(char *)target;
834                 *pErrorCode=U_USING_DEFAULT_WARNING;
835                 return;
836             }
837         }
838     }
839     U_ASSERT(count>=0);
840 
841     if(U_SUCCESS(*pErrorCode) && source<sourceLimit) {
842         if(target==(const uint8_t *)pFromUArgs->targetLimit) {
843             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
844         } else {
845             b=*source;
846             toULimit=U8_COUNT_BYTES(b);
847             if(toULimit>(sourceLimit-source)) {
848                 /* collect a truncated byte sequence */
849                 toULength=0;
850                 c=b;
851                 for(;;) {
852                     utf8->toUBytes[toULength++]=b;
853                     if(++source==sourceLimit) {
854                         /* partial byte sequence at end of source */
855                         utf8->toUnicodeStatus=c;
856                         utf8->toULength=toULength;
857                         utf8->mode=toULimit;
858                         break;
859                     } else if(!U8_IS_TRAIL(b=*source)) {
860                         /* lead byte in trail byte position */
861                         utf8->toULength=toULength;
862                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
863                         break;
864                     }
865                     c=(c<<6)+b;
866                 }
867             } else {
868                 /* partial-sequence target overflow: fall back to the pivoting implementation */
869                 *pErrorCode=U_USING_DEFAULT_WARNING;
870             }
871         }
872     }
873 
874     /* write back the updated pointers */
875     pToUArgs->source=(char *)source;
876     pFromUArgs->target=(char *)target;
877 }
878 
879 U_CDECL_END
880 
881 /* UTF-8 converter data ----------------------------------------------------- */
882 
/*
 * Function table for the UTF-8 converter.
 * The initializer is positional: each entry lands in the UConverterImpl member
 * at the same position, so the order of the lines below must not change.
 * NULL entries mean that no UTF-8-specific function is supplied for that slot.
 * NOTE(review): the slot names in the trailing comments are what I expect
 * from the UConverterImpl declaration (presumably in ucnv_cnv.h, which is not
 * visible in this chunk) -- confirm them against the header.
 */
883 static const UConverterImpl _UTF8Impl={
884     UCNV_UTF8, /* type */
885 
886     NULL, /* load */
887     NULL, /* unload */
888 
889     NULL, /* open */
890     NULL, /* close */
891     NULL, /* reset */
892 
893     ucnv_toUnicode_UTF8, /* toUnicode */
894     ucnv_toUnicode_UTF8_OFFSETS_LOGIC, /* toUnicodeWithOffsets */
895     ucnv_fromUnicode_UTF8, /* fromUnicode */
896     ucnv_fromUnicode_UTF8_OFFSETS_LOGIC, /* fromUnicodeWithOffsets */
897     ucnv_getNextUChar_UTF8, /* getNextUChar */
898 
899     NULL, /* getStarters */
900     NULL, /* getName */
901     NULL, /* writeSub */
902     NULL, /* safeClone */
903     ucnv_getNonSurrogateUnicodeSet, /* getUnicodeSet */
904 
905     ucnv_UTF8FromUTF8, /* toUTF8; the same routine also fills the next slot */
906     ucnv_UTF8FromUTF8 /* fromUTF8 */
907 };
908 
909 /* CCSID 1208 denotes UTF-8 for any version of Unicode. */
/*
 * Static descriptor of the UTF-8 converter.
 * The initializer is positional, so the values must stay in the member order
 * of UConverterStaticData.
 * NOTE(review): the member names in the trailing comments are what I expect
 * from the UConverterStaticData declaration, which is not visible in this
 * chunk -- confirm them against the header.
 */
910 static const UConverterStaticData _UTF8StaticData={
911     sizeof(UConverterStaticData), /* size of this structure */
912     "UTF-8", /* converter name */
913     1208, UCNV_IBM, UCNV_UTF8, /* CCSID, then presumably platform and conversion type */
914     1, 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
915     { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE, /* EF BF BD is U+FFFD in UTF-8, followed by its length 3; presumably the substitution sequence and two fallback flags */
916     0, /* presumably unicodeMask */
917     0, /* presumably subChar1 */
918     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
919 };
920 
921 
/*
 * Shared-data record for the UTF-8 converter. It ties the static descriptor
 * _UTF8StaticData to the function table _UTF8Impl through
 * UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER, a macro defined outside this chunk.
 * Unlike the two objects it points to, it is not declared static here.
 */
922 const UConverterSharedData _UTF8Data=
923         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF8StaticData, &_UTF8Impl);
924 
925 /* CESU-8 converter data ---------------------------------------------------- */
926 
/*
 * Function table for the CESU-8 converter.
 * It plugs in the same four UTF-8 conversion functions as the UTF-8 table;
 * the file header notes that CESU-8 shares them, with a branch for
 * supplementary code points.
 * The initializer is positional: each entry lands in the UConverterImpl member
 * at the same position, so the order of the lines below must not change.
 * NULL entries mean that no CESU-8-specific function is supplied for that slot.
 * NOTE(review): the slot names in the trailing comments are what I expect
 * from the UConverterImpl declaration (presumably in ucnv_cnv.h, which is not
 * visible in this chunk) -- confirm them against the header.
 */
927 static const UConverterImpl _CESU8Impl={
928     UCNV_CESU8, /* type */
929 
930     NULL, /* load */
931     NULL, /* unload */
932 
933     NULL, /* open */
934     NULL, /* close */
935     NULL, /* reset */
936 
937     ucnv_toUnicode_UTF8, /* toUnicode */
938     ucnv_toUnicode_UTF8_OFFSETS_LOGIC, /* toUnicodeWithOffsets */
939     ucnv_fromUnicode_UTF8, /* fromUnicode */
940     ucnv_fromUnicode_UTF8_OFFSETS_LOGIC, /* fromUnicodeWithOffsets */
941     NULL, /* getNextUChar: no dedicated function for CESU-8 */
942 
943     NULL, /* getStarters */
944     NULL, /* getName */
945     NULL, /* writeSub */
946     NULL, /* safeClone */
947     ucnv_getCompleteUnicodeSet, /* getUnicodeSet */
948 
949     NULL, /* toUTF8: no direct UTF-8 routine registered */
950     NULL /* fromUTF8: no direct UTF-8 routine registered */
951 };
952 
/*
 * Static descriptor of the CESU-8 converter.
 * The initializer is positional, so the values must stay in the member order
 * of UConverterStaticData.
 * NOTE(review): the member names in the trailing comments are what I expect
 * from the UConverterStaticData declaration, which is not visible in this
 * chunk -- confirm them against the header.
 */
953 static const UConverterStaticData _CESU8StaticData={
954     sizeof(UConverterStaticData), /* size of this structure */
955     "CESU-8", /* converter name */
956     9400, /* CCSID for CESU-8 */
957     UCNV_UNKNOWN, UCNV_CESU8, 1, 3, /* presumably platform, conversion type, min and max bytes per UChar */
958     { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE, /* EF BF BD is U+FFFD in UTF-8, followed by its length 3; presumably the substitution sequence and two fallback flags */
959     0, /* presumably unicodeMask */
960     0, /* presumably subChar1 */
961     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
962 };
963 
964 
/*
 * Shared-data record for the CESU-8 converter. It ties the static descriptor
 * _CESU8StaticData to the function table _CESU8Impl through
 * UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER, a macro defined outside this chunk.
 * Unlike the two objects it points to, it is not declared static here.
 */
965 const UConverterSharedData _CESU8Data=
966         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_CESU8StaticData, &_CESU8Impl);
967 
968 #endif
969