• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 **********************************************************************
3 *   Copyright (C) 2002-2012, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 **********************************************************************
6 *   file name:  ucnv_u8.c
7 *   encoding:   US-ASCII
8 *   tab size:   8 (not used)
9 *   indentation:4
10 *
11 *   created on: 2002jul01
12 *   created by: Markus W. Scherer
13 *
14 *   UTF-8 converter implementation. Used to be in ucnv_utf.c.
15 *
16 *   Also, CESU-8 implementation, see UTR 26.
17 *   The CESU-8 converter uses all the same functions as the
18 *   UTF-8 converter, with a branch for converting supplementary code points.
19 */
20 
21 #include "unicode/utypes.h"
22 
23 #if !UCONFIG_NO_CONVERSION
24 
25 #include "unicode/ucnv.h"
26 #include "unicode/utf.h"
27 #include "unicode/utf8.h"
28 #include "unicode/utf16.h"
29 #include "ucnv_bld.h"
30 #include "ucnv_cnv.h"
31 #include "cmemory.h"
32 
33 /* Prototypes --------------------------------------------------------------- */
34 
35 /* Keep these here to make finicky compilers happy */
36 
/*
 * Forward declarations of the two UTF-16 -> UTF-8 writers that are defined
 * with U_CFUNC linkage further down in this file.
 */
U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args,
                                           UErrorCode *err);
U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args,
                                                        UErrorCode *err);
41 
42 
43 /* UTF-8 -------------------------------------------------------------------- */
44 
45 /* UTF-8 Conversion DATA
46  *   for more information see Unicode Standard 2.0, Transformation Formats Appendix A-9
47  */
48 /*static const uint32_t REPLACEMENT_CHARACTER = 0x0000FFFD;*/
/* Largest value that fits in a single 16-bit code unit. */
#define MAXIMUM_UCS2            0x0000FFFF
/* Largest code point accepted by the validity checks in this file. */
#define MAXIMUM_UTF             0x0010FFFF
#define MAXIMUM_UCS4            0x7FFFFFFF
/* Constants for splitting a supplementary code point into a surrogate pair:
 * subtract HALF_BASE, then the upper bits (>> HALF_SHIFT) go into the lead
 * surrogate and the lower bits (& HALF_MASK) into the trail surrogate. */
#define HALF_SHIFT              10
#define HALF_BASE               0x0010000
#define HALF_MASK               0x3FF
#define SURROGATE_HIGH_START    0xD800
#define SURROGATE_HIGH_END      0xDBFF
#define SURROGATE_LOW_START     0xDC00
#define SURROGATE_LOW_END       0xDFFF

/* -SURROGATE_LOW_START + HALF_BASE, i.e. 0x10000 - 0xDC00 = 0x2400 = 9216 */
#define SURROGATE_LOW_BASE      9216
62 
/*
 * Indexed by the byte length of a sequence (0..6).
 * The decoders below accumulate a sequence as ((lead<<6)+trail)<<6+trail...
 * without masking off the lead/trail marker bits; subtracting the entry for
 * the sequence length removes those marker bits in one step
 * (for example [2] == (0xC0<<6) + 0x80 == 0x3080).
 * The 5- and 6-byte entries are the same sums reduced modulo 2^32.
 */
static const uint32_t offsetsFromUTF8[7] = {0,
  (uint32_t) 0x00000000, (uint32_t) 0x00003080, (uint32_t) 0x000E2080,
  (uint32_t) 0x03C82080, (uint32_t) 0xFA082080, (uint32_t) 0x82082080
};
67 
68 /* END OF UTF-8 Conversion DATA */
69 
/*
 * Indexed by the first byte of a sequence; gives the total number of bytes
 * in the sequence that this byte starts:
 *   0x00..0x7F -> 1
 *   0x80..0xBF -> 0 (not usable as a first byte)
 *   0xC0..0xDF -> 2
 *   0xE0..0xEF -> 3
 *   0xF0..0xF7 -> 4
 *   0xF8..0xFB -> 5
 *   0xFC..0xFD -> 6
 *   0xFE..0xFF -> 0 (not usable as a first byte)
 * Lengths 5 and 6 are only looked up so that the whole sequence can be
 * consumed; the callers reject them via utf8_minChar32[].
 */
static const int8_t bytesFromUTF8[256] = {
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
80 
/*
 * Starting with Unicode 3.0.1:
 * UTF-8 byte sequences of length N _must_ encode code points of or above utf8_minChar32[N];
 * byte sequences with more than 4 bytes are illegal in UTF-8,
 * which is tested with impossible values for them.
 *
 * Indexed by sequence length N (0..6). The entries for N == 5 and N == 6
 * are 0xffffffff, so a "ch >= utf8_minChar32[N]" test combined with
 * "ch <= MAXIMUM_UTF" can never succeed for those lengths.
 */
static const uint32_t
utf8_minChar32[7]={ 0, 0, 0x80, 0x800, 0x10000, 0xffffffff, 0xffffffff };
89 
hasCESU8Data(const UConverter * cnv)90 static UBool hasCESU8Data(const UConverter *cnv)
91 {
92 #if UCONFIG_NO_NON_HTML5_CONVERSION
93     return FALSE;
94 #else
95     return (UBool)(cnv->sharedData == &_CESU8Data);
96 #endif
97 }
98 
/*
 * Decodes UTF-8 (or CESU-8, see hasCESU8Data()) bytes from args->source into
 * UTF-16 code units at args->target, without recording offsets.
 *
 * args: source/sourceLimit and target/targetLimit delimit the buffers;
 *       args->source and args->target are advanced on return.
 * err:  set to U_ILLEGAL_CHAR_FOUND for a rejected sequence (the bytes seen
 *       so far are in cnv->toUBytes, their count in cnv->toULength), or to
 *       U_BUFFER_OVERFLOW_ERROR when the target is full while input remains
 *       or while a trail surrogate is still pending.
 *
 * A sequence cut off by the end of the input is remembered in
 * cnv->toUnicodeStatus (accumulated value), cnv->mode (expected length) and
 * cnv->toULength (bytes consumed) and resumed on the next call.
 */
static void ucnv_toUnicode_UTF8(UConverterToUnicodeArgs *args,
                                UErrorCode *err)
{
    UConverter *cnv = args->converter;
    const unsigned char *src = (unsigned char *) args->source;
    const unsigned char *srcEnd = (unsigned char *) args->sourceLimit;
    UChar *dst = args->target;
    const UChar *dstEnd = args->targetLimit;
    unsigned char *pending = cnv->toUBytes;
    UBool isCESU8 = hasCESU8Data(cnv);
    uint32_t ch, trail = 0;
    int32_t i, inBytes;

    /* Pick up a sequence that was split across the previous buffer boundary. */
    if (cnv->toUnicodeStatus && dst < dstEnd) {
        inBytes = cnv->mode;            /* total bytes expected */
        i = cnv->toULength;             /* bytes already consumed */
        cnv->toULength = 0;

        ch = cnv->toUnicodeStatus;      /* value accumulated so far */
        cnv->toUnicodeStatus = 0;
        goto morebytes;
    }

    while (src < srcEnd && dst < dstEnd) {
        ch = *(src++);
        if (ch < 0x80) {
            /* ASCII passes straight through */
            *(dst++) = (UChar) ch;
            continue;
        }

        /* remember the first byte and look up the sequence length */
        pending[0] = (char) ch;
        inBytes = bytesFromUTF8[ch];
        i = 1;

morebytes:
        while (i < inBytes) {
            if (src >= srcEnd) {
                /* input exhausted mid-sequence: save the partial state */
                cnv->toUnicodeStatus = ch;
                cnv->mode = inBytes;
                cnv->toULength = (int8_t) i;
                goto donefornow;
            }
            pending[i] = (char) (trail = *src);
            if (!U8_IS_TRAIL(trail)) {
                break; /* leaves i < inBytes, rejected below */
            }
            ch = (ch << 6) + trail;
            ++src;
            i++;
        }

        /* strip the lead/trail marker bits that were accumulated */
        ch -= offsetsFromUTF8[inBytes];

        /*
         * Reject the sequence unless all of the following hold:
         * - every expected trail byte was present (i == inBytes)
         * - the value is <= U+10ffff
         * - the shortest possible form was used (also rules out i >= 5)
         * - UTF-8: the value is not a surrogate code point;
         *   CESU-8: the sequence has at most 3 bytes (surrogates are allowed)
         */
        if (i != inBytes || ch > MAXIMUM_UTF || ch < utf8_minChar32[i] ||
            (isCESU8 ? i > 3 : U_IS_SURROGATE(ch))) {
            cnv->toULength = (int8_t) i;
            *err = U_ILLEGAL_CHAR_FOUND;
            break;
        }

        if (ch <= MAXIMUM_UCS2) {
            /* one 16-bit unit is enough */
            *(dst++) = (UChar) ch;
        } else {
            /* supplementary code point: emit a surrogate pair */
            ch -= HALF_BASE;
            *(dst++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
            ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
            if (dst >= dstEnd) {
                /* no room for the trail surrogate: park it in the overflow buffer */
                cnv->UCharErrorBuffer[0] = (UChar) ch;
                cnv->UCharErrorBufferLength = 1;
                *err = U_BUFFER_OVERFLOW_ERROR;
                break;
            }
            *(dst++) = (UChar) ch;
        }
    }

donefornow:
    if (src < srcEnd && dst >= dstEnd && U_SUCCESS(*err)) {
        /* target is full but input remains */
        *err = U_BUFFER_OVERFLOW_ERROR;
    }

    args->target = dst;
    args->source = (const char *) src;
}
226 
/*
 * Same decoding as ucnv_toUnicode_UTF8(), but additionally writes one entry
 * to args->offsets for every UTF-16 code unit stored in the target: the
 * index, counted from 0 at the start of this call, of the first byte of the
 * sequence the unit came from.
 *
 * args: source/target/offsets are advanced on return.
 * err:  U_ILLEGAL_CHAR_FOUND for a rejected sequence (bytes in
 *       cnv->toUBytes, count in cnv->toULength); U_BUFFER_OVERFLOW_ERROR
 *       when the target is full while input remains or while a trail
 *       surrogate is still pending.
 *
 * A sequence cut off by the end of the input is saved in
 * cnv->toUnicodeStatus / cnv->mode / cnv->toULength and resumed next call.
 */
static void ucnv_toUnicode_UTF8_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
                                              UErrorCode *err)
{
    UConverter *cnv = args->converter;
    const unsigned char *src = (unsigned char *) args->source;
    const unsigned char *srcEnd = (unsigned char *) args->sourceLimit;
    UChar *dst = args->target;
    const UChar *dstEnd = args->targetLimit;
    int32_t *offsets = args->offsets;
    int32_t srcIndex = 0;
    unsigned char *pending = cnv->toUBytes;
    UBool isCESU8 = hasCESU8Data(cnv);
    uint32_t ch, trail = 0;
    int32_t i, inBytes;

    /* Pick up a sequence that was split across the previous buffer boundary. */
    if (cnv->toUnicodeStatus && dst < dstEnd) {
        inBytes = cnv->mode;            /* total bytes expected */
        i = cnv->toULength;             /* bytes already consumed */
        cnv->toULength = 0;

        ch = cnv->toUnicodeStatus;      /* value accumulated so far */
        cnv->toUnicodeStatus = 0;
        goto morebytes;
    }

    while (src < srcEnd && dst < dstEnd) {
        ch = *(src++);
        if (ch < 0x80) {
            /* ASCII passes straight through */
            *(dst++) = (UChar) ch;
            *(offsets++) = srcIndex++;
            continue;
        }

        /* remember the first byte and look up the sequence length */
        pending[0] = (char) ch;
        inBytes = bytesFromUTF8[ch];
        i = 1;

morebytes:
        while (i < inBytes) {
            if (src >= srcEnd) {
                /* input exhausted mid-sequence: save the partial state */
                cnv->toUnicodeStatus = ch;
                cnv->mode = inBytes;
                cnv->toULength = (int8_t) i;
                goto donefornow;
            }
            pending[i] = (char) (trail = *src);
            if (!U8_IS_TRAIL(trail)) {
                break; /* leaves i < inBytes, rejected below */
            }
            ch = (ch << 6) + trail;
            ++src;
            i++;
        }

        /* strip the lead/trail marker bits that were accumulated */
        ch -= offsetsFromUTF8[inBytes];

        /*
         * Reject the sequence unless all of the following hold:
         * - every expected trail byte was present (i == inBytes)
         * - the value is <= U+10ffff
         * - the shortest possible form was used (also rules out i >= 5)
         * - UTF-8: the value is not a surrogate code point;
         *   CESU-8: the sequence has at most 3 bytes (surrogates are allowed)
         */
        if (i != inBytes || ch > MAXIMUM_UTF || ch < utf8_minChar32[i] ||
            (isCESU8 ? i > 3 : U_IS_SURROGATE(ch))) {
            cnv->toULength = (int8_t) i;
            *err = U_ILLEGAL_CHAR_FOUND;
            break;
        }

        if (ch <= MAXIMUM_UCS2) {
            /* one 16-bit unit is enough */
            *(dst++) = (UChar) ch;
            *(offsets++) = srcIndex;
        } else {
            /* supplementary code point: emit a surrogate pair */
            ch -= HALF_BASE;
            *(dst++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
            *(offsets++) = srcIndex;
            ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
            if (dst < dstEnd) {
                *(dst++) = (UChar) ch;
                *(offsets++) = srcIndex;
            } else {
                /*
                 * No room for the trail surrogate: park it in the overflow
                 * buffer. The loop condition ends the loop because the
                 * target is full.
                 */
                cnv->UCharErrorBuffer[0] = (UChar) ch;
                cnv->UCharErrorBufferLength = 1;
                *err = U_BUFFER_OVERFLOW_ERROR;
            }
        }
        /* the next sequence starts i bytes further into the input */
        srcIndex += i;
    }

donefornow:
    if (src < srcEnd && dst >= dstEnd && U_SUCCESS(*err)) {
        /* target is full but input remains */
        *err = U_BUFFER_OVERFLOW_ERROR;
    }

    args->target = dst;
    args->source = (const char *) src;
    args->offsets = offsets;
}
356 
ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,UErrorCode * err)357 U_CFUNC void ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
358                                     UErrorCode * err)
359 {
360     UConverter *cnv = args->converter;
361     const UChar *mySource = args->source;
362     const UChar *sourceLimit = args->sourceLimit;
363     uint8_t *myTarget = (uint8_t *) args->target;
364     const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
365     uint8_t *tempPtr;
366     UChar32 ch;
367     uint8_t tempBuf[4];
368     int32_t indexToWrite;
369     UBool isNotCESU8 = !hasCESU8Data(cnv);
370 
371     if (cnv->fromUChar32 && myTarget < targetLimit)
372     {
373         ch = cnv->fromUChar32;
374         cnv->fromUChar32 = 0;
375         goto lowsurrogate;
376     }
377 
378     while (mySource < sourceLimit && myTarget < targetLimit)
379     {
380         ch = *(mySource++);
381 
382         if (ch < 0x80)        /* Single byte */
383         {
384             *(myTarget++) = (uint8_t) ch;
385         }
386         else if (ch < 0x800)  /* Double byte */
387         {
388             *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
389             if (myTarget < targetLimit)
390             {
391                 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
392             }
393             else
394             {
395                 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
396                 cnv->charErrorBufferLength = 1;
397                 *err = U_BUFFER_OVERFLOW_ERROR;
398             }
399         }
400         else {
401             /* Check for surrogates */
402             if(U16_IS_SURROGATE(ch) && isNotCESU8) {
403 lowsurrogate:
404                 if (mySource < sourceLimit) {
405                     /* test both code units */
406                     if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
407                         /* convert and consume this supplementary code point */
408                         ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
409                         ++mySource;
410                         /* exit this condition tree */
411                     }
412                     else {
413                         /* this is an unpaired trail or lead code unit */
414                         /* callback(illegal) */
415                         cnv->fromUChar32 = ch;
416                         *err = U_ILLEGAL_CHAR_FOUND;
417                         break;
418                     }
419                 }
420                 else {
421                     /* no more input */
422                     cnv->fromUChar32 = ch;
423                     break;
424                 }
425             }
426 
427             /* Do we write the buffer directly for speed,
428             or do we have to be careful about target buffer space? */
429             tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
430 
431             if (ch <= MAXIMUM_UCS2) {
432                 indexToWrite = 2;
433                 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
434             }
435             else {
436                 indexToWrite = 3;
437                 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
438                 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
439             }
440             tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
441             tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
442 
443             if (tempPtr == myTarget) {
444                 /* There was enough space to write the codepoint directly. */
445                 myTarget += (indexToWrite + 1);
446             }
447             else {
448                 /* We might run out of room soon. Write it slowly. */
449                 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
450                     if (myTarget < targetLimit) {
451                         *(myTarget++) = *tempPtr;
452                     }
453                     else {
454                         cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
455                         *err = U_BUFFER_OVERFLOW_ERROR;
456                     }
457                 }
458             }
459         }
460     }
461 
462     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
463     {
464         *err = U_BUFFER_OVERFLOW_ERROR;
465     }
466 
467     args->target = (char *) myTarget;
468     args->source = mySource;
469 }
470 
ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,UErrorCode * err)471 U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
472                                                   UErrorCode * err)
473 {
474     UConverter *cnv = args->converter;
475     const UChar *mySource = args->source;
476     int32_t *myOffsets = args->offsets;
477     const UChar *sourceLimit = args->sourceLimit;
478     uint8_t *myTarget = (uint8_t *) args->target;
479     const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
480     uint8_t *tempPtr;
481     UChar32 ch;
482     int32_t offsetNum, nextSourceIndex;
483     int32_t indexToWrite;
484     uint8_t tempBuf[4];
485     UBool isNotCESU8 = !hasCESU8Data(cnv);
486 
487     if (cnv->fromUChar32 && myTarget < targetLimit)
488     {
489         ch = cnv->fromUChar32;
490         cnv->fromUChar32 = 0;
491         offsetNum = -1;
492         nextSourceIndex = 0;
493         goto lowsurrogate;
494     } else {
495         offsetNum = 0;
496     }
497 
498     while (mySource < sourceLimit && myTarget < targetLimit)
499     {
500         ch = *(mySource++);
501 
502         if (ch < 0x80)        /* Single byte */
503         {
504             *(myOffsets++) = offsetNum++;
505             *(myTarget++) = (char) ch;
506         }
507         else if (ch < 0x800)  /* Double byte */
508         {
509             *(myOffsets++) = offsetNum;
510             *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
511             if (myTarget < targetLimit)
512             {
513                 *(myOffsets++) = offsetNum++;
514                 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
515             }
516             else
517             {
518                 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
519                 cnv->charErrorBufferLength = 1;
520                 *err = U_BUFFER_OVERFLOW_ERROR;
521             }
522         }
523         else
524         /* Check for surrogates */
525         {
526             nextSourceIndex = offsetNum + 1;
527 
528             if(U16_IS_SURROGATE(ch) && isNotCESU8) {
529 lowsurrogate:
530                 if (mySource < sourceLimit) {
531                     /* test both code units */
532                     if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
533                         /* convert and consume this supplementary code point */
534                         ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
535                         ++mySource;
536                         ++nextSourceIndex;
537                         /* exit this condition tree */
538                     }
539                     else {
540                         /* this is an unpaired trail or lead code unit */
541                         /* callback(illegal) */
542                         cnv->fromUChar32 = ch;
543                         *err = U_ILLEGAL_CHAR_FOUND;
544                         break;
545                     }
546                 }
547                 else {
548                     /* no more input */
549                     cnv->fromUChar32 = ch;
550                     break;
551                 }
552             }
553 
554             /* Do we write the buffer directly for speed,
555             or do we have to be careful about target buffer space? */
556             tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
557 
558             if (ch <= MAXIMUM_UCS2) {
559                 indexToWrite = 2;
560                 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
561             }
562             else {
563                 indexToWrite = 3;
564                 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
565                 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
566             }
567             tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
568             tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
569 
570             if (tempPtr == myTarget) {
571                 /* There was enough space to write the codepoint directly. */
572                 myTarget += (indexToWrite + 1);
573                 myOffsets[0] = offsetNum;
574                 myOffsets[1] = offsetNum;
575                 myOffsets[2] = offsetNum;
576                 if (indexToWrite >= 3) {
577                     myOffsets[3] = offsetNum;
578                 }
579                 myOffsets += (indexToWrite + 1);
580             }
581             else {
582                 /* We might run out of room soon. Write it slowly. */
583                 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
584                     if (myTarget < targetLimit)
585                     {
586                         *(myOffsets++) = offsetNum;
587                         *(myTarget++) = *tempPtr;
588                     }
589                     else
590                     {
591                         cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
592                         *err = U_BUFFER_OVERFLOW_ERROR;
593                     }
594                 }
595             }
596             offsetNum = nextSourceIndex;
597         }
598     }
599 
600     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
601     {
602         *err = U_BUFFER_OVERFLOW_ERROR;
603     }
604 
605     args->target = (char *) myTarget;
606     args->source = mySource;
607     args->offsets = myOffsets;
608 }
609 
ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs * args,UErrorCode * err)610 static UChar32 ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
611                                                UErrorCode *err) {
612     UConverter *cnv;
613     const uint8_t *sourceInitial;
614     const uint8_t *source;
615     uint16_t extraBytesToWrite;
616     uint8_t myByte;
617     UChar32 ch;
618     int8_t i, isLegalSequence;
619 
620     /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */
621 
622     cnv = args->converter;
623     sourceInitial = source = (const uint8_t *)args->source;
624     if (source >= (const uint8_t *)args->sourceLimit)
625     {
626         /* no input */
627         *err = U_INDEX_OUTOFBOUNDS_ERROR;
628         return 0xffff;
629     }
630 
631     myByte = (uint8_t)*(source++);
632     if (myByte < 0x80)
633     {
634         args->source = (const char *)source;
635         return (UChar32)myByte;
636     }
637 
638     extraBytesToWrite = (uint16_t)bytesFromUTF8[myByte];
639     if (extraBytesToWrite == 0) {
640         cnv->toUBytes[0] = myByte;
641         cnv->toULength = 1;
642         *err = U_ILLEGAL_CHAR_FOUND;
643         args->source = (const char *)source;
644         return 0xffff;
645     }
646 
647     /*The byte sequence is longer than the buffer area passed*/
648     if (((const char *)source + extraBytesToWrite - 1) > args->sourceLimit)
649     {
650         /* check if all of the remaining bytes are trail bytes */
651         cnv->toUBytes[0] = myByte;
652         i = 1;
653         *err = U_TRUNCATED_CHAR_FOUND;
654         while(source < (const uint8_t *)args->sourceLimit) {
655             if(U8_IS_TRAIL(myByte = *source)) {
656                 cnv->toUBytes[i++] = myByte;
657                 ++source;
658             } else {
659                 /* error even before we run out of input */
660                 *err = U_ILLEGAL_CHAR_FOUND;
661                 break;
662             }
663         }
664         cnv->toULength = i;
665         args->source = (const char *)source;
666         return 0xffff;
667     }
668 
669     isLegalSequence = 1;
670     ch = myByte << 6;
671     switch(extraBytesToWrite)
672     {
673       /* note: code falls through cases! (sic)*/
674     case 6:
675         ch += (myByte = *source);
676         ch <<= 6;
677         if (!U8_IS_TRAIL(myByte))
678         {
679             isLegalSequence = 0;
680             break;
681         }
682         ++source;
683     case 5: /*fall through*/
684         ch += (myByte = *source);
685         ch <<= 6;
686         if (!U8_IS_TRAIL(myByte))
687         {
688             isLegalSequence = 0;
689             break;
690         }
691         ++source;
692     case 4: /*fall through*/
693         ch += (myByte = *source);
694         ch <<= 6;
695         if (!U8_IS_TRAIL(myByte))
696         {
697             isLegalSequence = 0;
698             break;
699         }
700         ++source;
701     case 3: /*fall through*/
702         ch += (myByte = *source);
703         ch <<= 6;
704         if (!U8_IS_TRAIL(myByte))
705         {
706             isLegalSequence = 0;
707             break;
708         }
709         ++source;
710     case 2: /*fall through*/
711         ch += (myByte = *source);
712         if (!U8_IS_TRAIL(myByte))
713         {
714             isLegalSequence = 0;
715             break;
716         }
717         ++source;
718     };
719     ch -= offsetsFromUTF8[extraBytesToWrite];
720     args->source = (const char *)source;
721 
722     /*
723      * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
724      * - use only trail bytes after a lead byte (checked above)
725      * - use the right number of trail bytes for a given lead byte
726      * - encode a code point <= U+10ffff
727      * - use the fewest possible number of bytes for their code points
728      * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
729      *
730      * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
731      * There are no irregular sequences any more.
732      */
733     if (isLegalSequence &&
734         (uint32_t)ch <= MAXIMUM_UTF &&
735         (uint32_t)ch >= utf8_minChar32[extraBytesToWrite] &&
736         !U_IS_SURROGATE(ch)
737     ) {
738         return ch; /* return the code point */
739     }
740 
741     for(i = 0; sourceInitial < source; ++i) {
742         cnv->toUBytes[i] = *sourceInitial++;
743     }
744     cnv->toULength = i;
745     *err = U_ILLEGAL_CHAR_FOUND;
746     return 0xffff;
747 }
748 
749 /* UTF-8-from-UTF-8 conversion functions ------------------------------------ */
750 
/*
 * Minimum code point values for n-byte UTF-8 sequences, n=0..4.
 * Indexed by the sequence length n; the values match the first five entries
 * of utf8_minChar32[] in this file.
 */
static const UChar32
utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 };
754 
/*
 * Offsets for n-byte UTF-8 sequences that were calculated with
 * ((lead<<6)+trail)<<6+trail...
 * Indexed by the sequence length n. Only entries 0..4 have initializers;
 * the array is declared with 7 elements, so entries 5 and 6 are
 * zero-initialized.
 */
static const UChar32
utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };
758 
759 /* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */
760 static void
ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs * pFromUArgs,UConverterToUnicodeArgs * pToUArgs,UErrorCode * pErrorCode)761 ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
762                   UConverterToUnicodeArgs *pToUArgs,
763                   UErrorCode *pErrorCode) {
764     UConverter *utf8;
765     const uint8_t *source, *sourceLimit;
766     uint8_t *target;
767     int32_t targetCapacity;
768     int32_t count;
769 
770     int8_t oldToULength, toULength, toULimit;
771 
772     UChar32 c;
773     uint8_t b, t1, t2;
774 
775     /* set up the local pointers */
776     utf8=pToUArgs->converter;
777     source=(uint8_t *)pToUArgs->source;
778     sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
779     target=(uint8_t *)pFromUArgs->target;
780     targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
781 
782     /* get the converter state from the UTF-8 UConverter */
783     c=(UChar32)utf8->toUnicodeStatus;
784     if(c!=0) {
785         toULength=oldToULength=utf8->toULength;
786         toULimit=(int8_t)utf8->mode;
787     } else {
788         toULength=oldToULength=toULimit=0;
789     }
790 
791     count=(int32_t)(sourceLimit-source)+oldToULength;
792     if(count<toULimit) {
793         /*
794          * Not enough input to complete the partial character.
795          * Jump to moreBytes below - it will not output to target.
796          */
797     } else if(targetCapacity<toULimit) {
798         /*
799          * Not enough target capacity to output the partial character.
800          * Let the standard converter handle this.
801          */
802         *pErrorCode=U_USING_DEFAULT_WARNING;
803         return;
804     } else {
805         /*
806          * Use a single counter for source and target, counting the minimum of
807          * the source length and the target capacity.
808          * As a result, the source length is checked only once per multi-byte
809          * character instead of twice.
810          *
811          * Make sure that the last byte sequence is complete, or else
812          * stop just before it.
813          * (The longest legal byte sequence has 3 trail bytes.)
814          * Count oldToULength (number of source bytes from a previous buffer)
815          * into the source length but reduce the source index by toULimit
816          * while going back over trail bytes in order to not go back into
817          * the bytes that will be read for finishing a partial
818          * sequence from the previous buffer.
819          * Let the standard converter handle edge cases.
820          */
821         int32_t i;
822 
823         if(count>targetCapacity) {
824             count=targetCapacity;
825         }
826 
827         i=0;
828         while(i<3 && i<(count-toULimit)) {
829             b=source[count-oldToULength-i-1];
830             if(U8_IS_TRAIL(b)) {
831                 ++i;
832             } else {
833                 if(i<U8_COUNT_TRAIL_BYTES(b)) {
834                     /* stop converting before the lead byte if there are not enough trail bytes for it */
835                     count-=i+1;
836                 }
837                 break;
838             }
839         }
840     }
841 
842     if(c!=0) {
843         utf8->toUnicodeStatus=0;
844         utf8->toULength=0;
845         goto moreBytes;
846         /* See note in ucnv_SBCSFromUTF8() about this goto. */
847     }
848 
849     /* conversion loop */
850     while(count>0) {
851         b=*source++;
852         if((int8_t)b>=0) {
853             /* convert ASCII */
854             *target++=b;
855             --count;
856             continue;
857         } else {
858             if(b>0xe0) {
859                 if( /* handle U+1000..U+D7FF inline */
860                     (t1=source[0]) >= 0x80 && ((b<0xed && (t1 <= 0xbf)) ||
861                                                (b==0xed && (t1 <= 0x9f))) &&
862                     (t2=source[1]) >= 0x80 && t2 <= 0xbf
863                 ) {
864                     source+=2;
865                     *target++=b;
866                     *target++=t1;
867                     *target++=t2;
868                     count-=3;
869                     continue;
870                 }
871             } else if(b<0xe0) {
872                 if( /* handle U+0080..U+07FF inline */
873                     b>=0xc2 &&
874                     (t1=*source) >= 0x80 && t1 <= 0xbf
875                 ) {
876                     ++source;
877                     *target++=b;
878                     *target++=t1;
879                     count-=2;
880                     continue;
881                 }
882             } else if(b==0xe0) {
883                 if( /* handle U+0800..U+0FFF inline */
884                     (t1=source[0]) >= 0xa0 && t1 <= 0xbf &&
885                     (t2=source[1]) >= 0x80 && t2 <= 0xbf
886                 ) {
887                     source+=2;
888                     *target++=b;
889                     *target++=t1;
890                     *target++=t2;
891                     count-=3;
892                     continue;
893                 }
894             }
895 
896             /* handle "complicated" and error cases, and continuing partial characters */
897             oldToULength=0;
898             toULength=1;
899             toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
900             c=b;
901 moreBytes:
902             while(toULength<toULimit) {
903                 if(source<sourceLimit) {
904                     b=*source;
905                     if(U8_IS_TRAIL(b)) {
906                         ++source;
907                         ++toULength;
908                         c=(c<<6)+b;
909                     } else {
910                         break; /* sequence too short, stop with toULength<toULimit */
911                     }
912                 } else {
913                     /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
914                     source-=(toULength-oldToULength);
915                     while(oldToULength<toULength) {
916                         utf8->toUBytes[oldToULength++]=*source++;
917                     }
918                     utf8->toUnicodeStatus=c;
919                     utf8->toULength=toULength;
920                     utf8->mode=toULimit;
921                     pToUArgs->source=(char *)source;
922                     pFromUArgs->target=(char *)target;
923                     return;
924                 }
925             }
926 
927             if( toULength==toULimit &&      /* consumed all trail bytes */
928                 (toULength==3 || toULength==2) &&             /* BMP */
929                 (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
930                 (c<=0xd7ff || 0xe000<=c)    /* not a surrogate */
931             ) {
932                 /* legal byte sequence for BMP code point */
933             } else if(
934                 toULength==toULimit && toULength==4 &&
935                 (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
936             ) {
937                 /* legal byte sequence for supplementary code point */
938             } else {
939                 /* error handling: illegal UTF-8 byte sequence */
940                 source-=(toULength-oldToULength);
941                 while(oldToULength<toULength) {
942                     utf8->toUBytes[oldToULength++]=*source++;
943                 }
944                 utf8->toULength=toULength;
945                 pToUArgs->source=(char *)source;
946                 pFromUArgs->target=(char *)target;
947                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
948                 return;
949             }
950 
951             /* copy the legal byte sequence to the target */
952             {
953                 int8_t i;
954 
955                 for(i=0; i<oldToULength; ++i) {
956                     *target++=utf8->toUBytes[i];
957                 }
958                 source-=(toULength-oldToULength);
959                 for(; i<toULength; ++i) {
960                     *target++=*source++;
961                 }
962                 count-=toULength;
963             }
964         }
965     }
966 
967     if(U_SUCCESS(*pErrorCode) && source<sourceLimit) {
968         if(target==(const uint8_t *)pFromUArgs->targetLimit) {
969             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
970         } else {
971             b=*source;
972             toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
973             if(toULimit>(sourceLimit-source)) {
974                 /* collect a truncated byte sequence */
975                 toULength=0;
976                 c=b;
977                 for(;;) {
978                     utf8->toUBytes[toULength++]=b;
979                     if(++source==sourceLimit) {
980                         /* partial byte sequence at end of source */
981                         utf8->toUnicodeStatus=c;
982                         utf8->toULength=toULength;
983                         utf8->mode=toULimit;
984                         break;
985                     } else if(!U8_IS_TRAIL(b=*source)) {
986                         /* lead byte in trail byte position */
987                         utf8->toULength=toULength;
988                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
989                         break;
990                     }
991                     c=(c<<6)+b;
992                 }
993             } else {
994                 /* partial-sequence target overflow: fall back to the pivoting implementation */
995                 *pErrorCode=U_USING_DEFAULT_WARNING;
996             }
997         }
998     }
999 
1000     /* write back the updated pointers */
1001     pToUArgs->source=(char *)source;
1002     pFromUArgs->target=(char *)target;
1003 }
1004 
1005 /* UTF-8 converter data ----------------------------------------------------- */
1006 
1007 static const UConverterImpl _UTF8Impl={ /* positional function table for the UTF-8 converter */
1008     UCNV_UTF8,
1009     /* NOTE(review): UConverterImpl is declared outside this chunk, so the slot roles noted below are presumed - confirm against its declaration. */
1010     NULL,
1011     NULL,
1012     /* NULL slots: no converter-specific function is supplied for them. */
1013     NULL,
1014     NULL,
1015     NULL,
1016     /* conversion entry points: to/from Unicode, each with an _OFFSETS_LOGIC variant, then next-UChar */
1017     ucnv_toUnicode_UTF8,
1018     ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
1019     ucnv_fromUnicode_UTF8,
1020     ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
1021     ucnv_getNextUChar_UTF8,
1022 
1023     NULL,
1024     NULL,
1025     NULL,
1026     NULL,
1027     ucnv_getNonSurrogateUnicodeSet,
1028     /* the same function fills both trailing slots; presumably the direct to-UTF-8 and from-UTF-8 paths - confirm */
1029     ucnv_UTF8FromUTF8,
1030     ucnv_UTF8FromUTF8
1031 };
1032 
1033 /* Static description of the UTF-8 converter. CCSID 1208 refers to UTF-8 for any version of Unicode. */
1034 static const UConverterStaticData _UTF8StaticData={
1035     sizeof(UConverterStaticData),
1036     "UTF-8",
1037     1208, UCNV_IBM, UCNV_UTF8, /* CCSID 1208; the two constants are presumably platform and converter type - confirm */
1038     1, 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
1039     { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE, /* EF BF BD is U+FFFD in UTF-8; the 3 is presumably its byte length; flag roles not shown here - confirm */
1040     0,
1041     0,
1042     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1043 };
1044 
1045 
1046 const UConverterSharedData _UTF8Data={ /* non-static shared-data record for UTF-8; links the static data and function table defined in this file */
1047     sizeof(UConverterSharedData), ~((uint32_t) 0), /* bitwise NOT of a 32-bit zero (all bits set); presumably a reference-count sentinel - confirm */
1048     NULL, NULL, &_UTF8StaticData, FALSE, &_UTF8Impl, /* NOTE(review): roles of the NULL/FALSE fields are declared outside this chunk - confirm */
1049     0
1050 };
1051 
1052 /* CESU-8 converter data ---------------------------------------------------- */
1053 
1054 static const UConverterImpl _CESU8Impl={ /* positional function table for the CESU-8 converter */
1055     UCNV_CESU8,
1056     /* NOTE(review): UConverterImpl is declared outside this chunk, so the slot roles noted below are presumed - confirm against its declaration. */
1057     NULL,
1058     NULL,
1059 
1060     NULL,
1061     NULL,
1062     NULL,
1063     /* same four conversion functions as the UTF-8 table (the file header notes CESU-8 shares the UTF-8 functions) */
1064     ucnv_toUnicode_UTF8,
1065     ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
1066     ucnv_fromUnicode_UTF8,
1067     ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
1068     NULL, /* the UTF-8 table has ucnv_getNextUChar_UTF8 in this position; CESU-8 supplies none */
1069 
1070     NULL,
1071     NULL,
1072     NULL,
1073     NULL,
1074     ucnv_getCompleteUnicodeSet /* last explicit initializer; members after it are implicitly zero-initialized (the UTF-8 table sets two more) */
1075 };
1076 
1077 static const UConverterStaticData _CESU8StaticData={ /* static description of the CESU-8 converter */
1078     sizeof(UConverterStaticData),
1079     "CESU-8",
1080     9400, /* CCSID for CESU-8 */
1081     UCNV_UNKNOWN, UCNV_CESU8, 1, 3, /* presumably platform, converter type, then min/max bytes per UChar - confirm */
1082     { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE, /* EF BF BD is U+FFFD in UTF-8; the 3 is presumably its byte length; flag roles not shown here - confirm */
1083     0,
1084     0,
1085     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1086 };
1087 
1088 
1089 const UConverterSharedData _CESU8Data={ /* non-static shared-data record for CESU-8; links the static data and function table defined in this file */
1090     sizeof(UConverterSharedData), ~((uint32_t) 0), /* bitwise NOT of a 32-bit zero (all bits set); presumably a reference-count sentinel - confirm */
1091     NULL, NULL, &_CESU8StaticData, FALSE, &_CESU8Impl, /* NOTE(review): roles of the NULL/FALSE fields are declared outside this chunk - confirm */
1092     0
1093 };
1094 
1095 #endif
1096