• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/*
**********************************************************************
*   Copyright (C) 2002-2007, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*   file name:  ucnv_u8.c
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2002jul01
*   created by: Markus W. Scherer
*
*   UTF-8 converter implementation. Used to be in ucnv_utf.c.
*
*   Also, CESU-8 implementation, see UTR 26.
*   The CESU-8 converter uses all the same functions as the
*   UTF-8 converter, with a branch for converting supplementary code points.
*/
20 
21 #include "unicode/utypes.h"
22 
23 #if !UCONFIG_NO_CONVERSION
24 
25 #include "unicode/ucnv.h"
26 #include "ucnv_bld.h"
27 #include "ucnv_cnv.h"
28 #include "cmemory.h"
29 
30 /* Prototypes --------------------------------------------------------------- */
31 
32 /* Keep these here to make finicky compilers happy */
33 
34 U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args,
35                                            UErrorCode *err);
36 U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args,
37                                                         UErrorCode *err);
38 
39 
40 /* UTF-8 -------------------------------------------------------------------- */
41 
/* UTF-8 Conversion DATA
 *   for more information see Unicode Standard 2.0, Transformation Formats Appendix A-9
 */
/*static const uint32_t REPLACEMENT_CHARACTER = 0x0000FFFD;*/
#define MAXIMUM_UCS2            0x0000FFFF  /* largest value that fits one UTF-16 code unit */
#define MAXIMUM_UTF             0x0010FFFF  /* largest Unicode code point */
#define MAXIMUM_UCS4            0x7FFFFFFF
#define HALF_SHIFT              10          /* payload bits per surrogate */
#define HALF_BASE               0x0010000   /* first supplementary code point */
#define HALF_MASK               0x3FF       /* low HALF_SHIFT bits */
#define SURROGATE_HIGH_START    0xD800
#define SURROGATE_HIGH_END      0xDBFF
#define SURROGATE_LOW_START     0xDC00
#define SURROGATE_LOW_END       0xDFFF

/* -SURROGATE_LOW_START + HALF_BASE */
#define SURROGATE_LOW_BASE      9216

/*
 * offsetsFromUTF8[n] is subtracted from the value accumulated for an n-byte
 * sequence with ch = (ch << 6) + nextByte. It removes the marker bits of the
 * lead byte and the 0x80 of every trail byte in one step, e.g. for n == 2:
 * (0xC0 << 6) + 0x80 == 0x3080. Arithmetic is modulo 2^32, which is why the
 * entry for n == 6 no longer contains a lead-byte contribution.
 * Index 0 is unused padding so that the table can be indexed by byte count.
 */
static const uint32_t offsetsFromUTF8[7] = {0,
  (uint32_t) 0x00000000, (uint32_t) 0x00003080, (uint32_t) 0x000E2080,
  (uint32_t) 0x03C82080, (uint32_t) 0xFA082080, (uint32_t) 0x82082080
};

/* END OF UTF-8 Conversion DATA */
66 
/*
 * Total length in bytes of the UTF-8 sequence introduced by each possible
 * first byte. 0 marks bytes that cannot start a sequence: the trail bytes
 * 0x80..0xBF and the bytes 0xFE and 0xFF. Lengths 5 and 6 are still listed
 * for 0xF8..0xFD so that the callers can consume such sequences as a unit
 * before rejecting them.
 */
static const int8_t bytesFromUTF8[256] = {
  /* 0x00..0x7F: single-byte (ASCII) */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x80..0xBF: trail bytes, never a lead byte */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0xC0..0xDF: two-byte lead bytes */
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xE0..0xEF: 3, 0xF0..0xF7: 4, 0xF8..0xFB: 5, 0xFC..0xFD: 6, 0xFE..0xFF: invalid */
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
77 
/*
 * Starting with Unicode 3.0.1:
 * UTF-8 byte sequences of length N _must_ encode code points of or above utf8_minChar32[N];
 * byte sequences with more than 4 bytes are illegal in UTF-8,
 * which is tested with impossible values for them
 * (0xffffffff is above MAXIMUM_UTF, so the "ch >= utf8_minChar32[N]" test
 * can never succeed together with the "ch <= MAXIMUM_UTF" test for N >= 5).
 */
static const uint32_t
utf8_minChar32[7]={ 0, 0, 0x80, 0x800, 0x10000, 0xffffffff, 0xffffffff };
86 
/*
 * Converts UTF-8 (or CESU-8) bytes to UTF-16 without writing offsets.
 *
 * Reads bytes from args->source up to args->sourceLimit and writes UChars to
 * args->target up to args->targetLimit; args->source and args->target are
 * advanced past what was consumed/produced.
 *
 * State kept in the converter between calls:
 * - An incomplete multi-byte sequence at the end of the input is saved in
 *   cnv->toUnicodeStatus (accumulated value), cnv->mode (expected length),
 *   cnv->toULength (bytes seen so far) and cnv->toUBytes (the bytes), and is
 *   resumed at the start of the next call.
 * - If the target fills up between the two surrogates of a supplementary
 *   code point, the trail surrogate goes to cnv->UCharErrorBuffer.
 *
 * Errors reported through *err:
 * - U_BUFFER_OVERFLOW_ERROR when the target is full but input remains.
 * - U_ILLEGAL_CHAR_FOUND for an ill-formed sequence; cnv->toULength is set to
 *   the number of bytes collected in cnv->toUBytes.
 */
static void ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
                                  UErrorCode * err)
{
    UConverter *cnv = args->converter;
    const unsigned char *mySource = (const unsigned char *) args->source;
    UChar *myTarget = args->target;
    const unsigned char *sourceLimit = (const unsigned char *) args->sourceLimit;
    const UChar *targetLimit = args->targetLimit;
    unsigned char *toUBytes = cnv->toUBytes;
    UBool isCESU8 = (UBool)(cnv->sharedData == &_CESU8Data);
    uint32_t ch, ch2 = 0;
    int32_t i, inBytes;

    /* Restore size of current sequence */
    if (cnv->toUnicodeStatus && myTarget < targetLimit)
    {
        inBytes = cnv->mode;            /* restore # of bytes to consume */
        i = cnv->toULength;             /* restore # of bytes consumed */
        cnv->toULength = 0;

        ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
        cnv->toUnicodeStatus = 0;
        goto morebytes;
    }


    while (mySource < sourceLimit && myTarget < targetLimit)
    {
        ch = *(mySource++);
        if (ch < 0x80)        /* Simple case */
        {
            *(myTarget++) = (UChar) ch;
        }
        else
        {
            /* store the first byte; it is kept for error reporting/resuming */
            toUBytes[0] = (unsigned char) ch;
            inBytes = bytesFromUTF8[ch]; /* lookup current sequence length */
            i = 1;

morebytes:
            while (i < inBytes)
            {
                if (mySource < sourceLimit)
                {
                    toUBytes[i] = (unsigned char) (ch2 = *mySource);
                    if (!UTF8_IS_TRAIL(ch2))
                    {
                        /* leave the non-trail byte unconsumed for the next character */
                        break; /* i < inBytes */
                    }
                    ch = (ch << 6) + ch2;
                    ++mySource;
                    i++;
                }
                else
                {
                    /* stores a partially calculated target*/
                    cnv->toUnicodeStatus = ch;
                    cnv->mode = inBytes;
                    cnv->toULength = (int8_t) i;
                    goto donefornow;
                }
            }

            /* Remove the accumulated high bits */
            ch -= offsetsFromUTF8[inBytes];

            /*
             * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
             * - use only trail bytes after a lead byte (checked above)
             * - use the right number of trail bytes for a given lead byte
             * - encode a code point <= U+10ffff
             * - use the fewest possible number of bytes for their code points
             * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
             *
             * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
             * There are no irregular sequences any more.
             * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
             */
            if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
                (isCESU8 ? i <= 3 : !UTF_IS_SURROGATE(ch)))
            {
                /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
                if (ch <= MAXIMUM_UCS2)
                {
                    /* fits in 16 bits */
                    *(myTarget++) = (UChar) ch;
                }
                else
                {
                    /* write out the surrogates */
                    ch -= HALF_BASE;
                    *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
                    ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
                    if (myTarget < targetLimit)
                    {
                        *(myTarget++) = (UChar)ch;
                    }
                    else
                    {
                        /* Target is full: park the trail surrogate in the overflow buffer */
                        cnv->UCharErrorBuffer[0] = (UChar) ch;
                        cnv->UCharErrorBufferLength = 1;
                        *err = U_BUFFER_OVERFLOW_ERROR;
                        break;
                    }
                }
            }
            else
            {
                /* ill-formed sequence: report the i bytes collected in toUBytes */
                cnv->toULength = (int8_t)i;
                *err = U_ILLEGAL_CHAR_FOUND;
                break;
            }
        }
    }

donefornow:
    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
    {
        /* End of target buffer */
        *err = U_BUFFER_OVERFLOW_ERROR;
    }

    args->target = myTarget;
    args->source = (const char *) mySource;
}
214 
/*
 * Converts UTF-8 (or CESU-8) bytes to UTF-16 and records, for every UChar
 * written, the source index of the byte sequence it came from.
 *
 * Same conversion logic and converter state handling as
 * ucnv_toUnicode_UTF8(); in addition one int32_t is written to args->offsets
 * per output UChar (both surrogates of a pair get the same offset), and
 * args->offsets is advanced accordingly. Offsets are counted from 0 at the
 * start of this call.
 *
 * Errors reported through *err:
 * - U_BUFFER_OVERFLOW_ERROR when the target is full but input remains, or
 *   when a trail surrogate had to be stored in cnv->UCharErrorBuffer.
 * - U_ILLEGAL_CHAR_FOUND for an ill-formed sequence; cnv->toULength is set to
 *   the number of bytes collected in cnv->toUBytes.
 */
static void ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args,
                                                UErrorCode * err)
{
    UConverter *cnv = args->converter;
    const unsigned char *mySource = (const unsigned char *) args->source;
    UChar *myTarget = args->target;
    int32_t *myOffsets = args->offsets;
    int32_t offsetNum = 0;
    const unsigned char *sourceLimit = (const unsigned char *) args->sourceLimit;
    const UChar *targetLimit = args->targetLimit;
    unsigned char *toUBytes = cnv->toUBytes;
    UBool isCESU8 = (UBool)(cnv->sharedData == &_CESU8Data);
    uint32_t ch, ch2 = 0;
    int32_t i, inBytes;

    /* Restore size of current sequence */
    if (cnv->toUnicodeStatus && myTarget < targetLimit)
    {
        inBytes = cnv->mode;            /* restore # of bytes to consume */
        i = cnv->toULength;             /* restore # of bytes consumed */
        cnv->toULength = 0;

        ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
        cnv->toUnicodeStatus = 0;
        goto morebytes;
    }

    while (mySource < sourceLimit && myTarget < targetLimit)
    {
        ch = *(mySource++);
        if (ch < 0x80)        /* Simple case */
        {
            *(myTarget++) = (UChar) ch;
            *(myOffsets++) = offsetNum++;
        }
        else
        {
            /* store the first byte; it is kept for error reporting/resuming */
            toUBytes[0] = (unsigned char) ch;
            inBytes = bytesFromUTF8[ch]; /* lookup current sequence length */
            i = 1;

morebytes:
            while (i < inBytes)
            {
                if (mySource < sourceLimit)
                {
                    toUBytes[i] = (unsigned char) (ch2 = *mySource);
                    if (!UTF8_IS_TRAIL(ch2))
                    {
                        /* leave the non-trail byte unconsumed for the next character */
                        break; /* i < inBytes */
                    }
                    ch = (ch << 6) + ch2;
                    ++mySource;
                    i++;
                }
                else
                {
                    /* input exhausted mid-sequence: save the partial state */
                    cnv->toUnicodeStatus = ch;
                    cnv->mode = inBytes;
                    cnv->toULength = (int8_t)i;
                    goto donefornow;
                }
            }

            /* Remove the accumulated high bits */
            ch -= offsetsFromUTF8[inBytes];

            /*
             * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
             * - use only trail bytes after a lead byte (checked above)
             * - use the right number of trail bytes for a given lead byte
             * - encode a code point <= U+10ffff
             * - use the fewest possible number of bytes for their code points
             * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
             *
             * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
             * There are no irregular sequences any more.
             * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
             */
            if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
                (isCESU8 ? i <= 3 : !UTF_IS_SURROGATE(ch)))
            {
                /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
                if (ch <= MAXIMUM_UCS2)
                {
                    /* fits in 16 bits */
                    *(myTarget++) = (UChar) ch;
                    *(myOffsets++) = offsetNum;
                }
                else
                {
                    /* write out the surrogates */
                    ch -= HALF_BASE;
                    *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
                    *(myOffsets++) = offsetNum;
                    ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
                    if (myTarget < targetLimit)
                    {
                        *(myTarget++) = (UChar)ch;
                        *(myOffsets++) = offsetNum;
                    }
                    else
                    {
                        /* Target is full: park the trail surrogate in the overflow buffer */
                        cnv->UCharErrorBuffer[0] = (UChar) ch;
                        cnv->UCharErrorBufferLength = 1;
                        *err = U_BUFFER_OVERFLOW_ERROR;
                        /*
                         * Stop explicitly, like ucnv_toUnicode_UTF8() does.
                         * The loop condition would end the loop anyway because
                         * myTarget has reached targetLimit.
                         */
                        break;
                    }
                }
                /* the next character starts i bytes further into the source */
                offsetNum += i;
            }
            else
            {
                /* ill-formed sequence: report the i bytes collected in toUBytes */
                cnv->toULength = (int8_t)i;
                *err = U_ILLEGAL_CHAR_FOUND;
                break;
            }
        }
    }

donefornow:
    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
    {   /* End of target buffer */
        *err = U_BUFFER_OVERFLOW_ERROR;
    }

    args->target = myTarget;
    args->source = (const char *) mySource;
    args->offsets = myOffsets;
}
344 
ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,UErrorCode * err)345 U_CFUNC void ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
346                                     UErrorCode * err)
347 {
348     UConverter *cnv = args->converter;
349     const UChar *mySource = args->source;
350     const UChar *sourceLimit = args->sourceLimit;
351     uint8_t *myTarget = (uint8_t *) args->target;
352     const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
353     uint8_t *tempPtr;
354     UChar32 ch;
355     uint8_t tempBuf[4];
356     int32_t indexToWrite;
357     UBool isNotCESU8 = (UBool)(cnv->sharedData != &_CESU8Data);
358 
359     if (cnv->fromUChar32 && myTarget < targetLimit)
360     {
361         ch = cnv->fromUChar32;
362         cnv->fromUChar32 = 0;
363         goto lowsurrogate;
364     }
365 
366     while (mySource < sourceLimit && myTarget < targetLimit)
367     {
368         ch = *(mySource++);
369 
370         if (ch < 0x80)        /* Single byte */
371         {
372             *(myTarget++) = (uint8_t) ch;
373         }
374         else if (ch < 0x800)  /* Double byte */
375         {
376             *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
377             if (myTarget < targetLimit)
378             {
379                 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
380             }
381             else
382             {
383                 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
384                 cnv->charErrorBufferLength = 1;
385                 *err = U_BUFFER_OVERFLOW_ERROR;
386             }
387         }
388         else {
389             /* Check for surrogates */
390             if(UTF_IS_SURROGATE(ch) && isNotCESU8) {
391 lowsurrogate:
392                 if (mySource < sourceLimit) {
393                     /* test both code units */
394                     if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_SECOND_SURROGATE(*mySource)) {
395                         /* convert and consume this supplementary code point */
396                         ch=UTF16_GET_PAIR_VALUE(ch, *mySource);
397                         ++mySource;
398                         /* exit this condition tree */
399                     }
400                     else {
401                         /* this is an unpaired trail or lead code unit */
402                         /* callback(illegal) */
403                         cnv->fromUChar32 = ch;
404                         *err = U_ILLEGAL_CHAR_FOUND;
405                         break;
406                     }
407                 }
408                 else {
409                     /* no more input */
410                     cnv->fromUChar32 = ch;
411                     break;
412                 }
413             }
414 
415             /* Do we write the buffer directly for speed,
416             or do we have to be careful about target buffer space? */
417             tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
418 
419             if (ch <= MAXIMUM_UCS2) {
420                 indexToWrite = 2;
421                 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
422             }
423             else {
424                 indexToWrite = 3;
425                 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
426                 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
427             }
428             tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
429             tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
430 
431             if (tempPtr == myTarget) {
432                 /* There was enough space to write the codepoint directly. */
433                 myTarget += (indexToWrite + 1);
434             }
435             else {
436                 /* We might run out of room soon. Write it slowly. */
437                 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
438                     if (myTarget < targetLimit) {
439                         *(myTarget++) = *tempPtr;
440                     }
441                     else {
442                         cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
443                         *err = U_BUFFER_OVERFLOW_ERROR;
444                     }
445                 }
446             }
447         }
448     }
449 
450     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
451     {
452         *err = U_BUFFER_OVERFLOW_ERROR;
453     }
454 
455     args->target = (char *) myTarget;
456     args->source = mySource;
457 }
458 
ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,UErrorCode * err)459 U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
460                                                   UErrorCode * err)
461 {
462     UConverter *cnv = args->converter;
463     const UChar *mySource = args->source;
464     int32_t *myOffsets = args->offsets;
465     const UChar *sourceLimit = args->sourceLimit;
466     uint8_t *myTarget = (uint8_t *) args->target;
467     const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
468     uint8_t *tempPtr;
469     UChar32 ch;
470     int32_t offsetNum, nextSourceIndex;
471     int32_t indexToWrite;
472     uint8_t tempBuf[4];
473     UBool isNotCESU8 = (UBool)(cnv->sharedData != &_CESU8Data);
474 
475     if (cnv->fromUChar32 && myTarget < targetLimit)
476     {
477         ch = cnv->fromUChar32;
478         cnv->fromUChar32 = 0;
479         offsetNum = -1;
480         nextSourceIndex = 0;
481         goto lowsurrogate;
482     } else {
483         offsetNum = 0;
484     }
485 
486     while (mySource < sourceLimit && myTarget < targetLimit)
487     {
488         ch = *(mySource++);
489 
490         if (ch < 0x80)        /* Single byte */
491         {
492             *(myOffsets++) = offsetNum++;
493             *(myTarget++) = (char) ch;
494         }
495         else if (ch < 0x800)  /* Double byte */
496         {
497             *(myOffsets++) = offsetNum;
498             *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
499             if (myTarget < targetLimit)
500             {
501                 *(myOffsets++) = offsetNum++;
502                 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
503             }
504             else
505             {
506                 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
507                 cnv->charErrorBufferLength = 1;
508                 *err = U_BUFFER_OVERFLOW_ERROR;
509             }
510         }
511         else
512         /* Check for surrogates */
513         {
514             nextSourceIndex = offsetNum + 1;
515 
516             if(UTF_IS_SURROGATE(ch) && isNotCESU8) {
517 lowsurrogate:
518                 if (mySource < sourceLimit) {
519                     /* test both code units */
520                     if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_SECOND_SURROGATE(*mySource)) {
521                         /* convert and consume this supplementary code point */
522                         ch=UTF16_GET_PAIR_VALUE(ch, *mySource);
523                         ++mySource;
524                         ++nextSourceIndex;
525                         /* exit this condition tree */
526                     }
527                     else {
528                         /* this is an unpaired trail or lead code unit */
529                         /* callback(illegal) */
530                         cnv->fromUChar32 = ch;
531                         *err = U_ILLEGAL_CHAR_FOUND;
532                         break;
533                     }
534                 }
535                 else {
536                     /* no more input */
537                     cnv->fromUChar32 = ch;
538                     break;
539                 }
540             }
541 
542             /* Do we write the buffer directly for speed,
543             or do we have to be careful about target buffer space? */
544             tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
545 
546             if (ch <= MAXIMUM_UCS2) {
547                 indexToWrite = 2;
548                 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
549             }
550             else {
551                 indexToWrite = 3;
552                 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
553                 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
554             }
555             tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
556             tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
557 
558             if (tempPtr == myTarget) {
559                 /* There was enough space to write the codepoint directly. */
560                 myTarget += (indexToWrite + 1);
561                 myOffsets[0] = offsetNum;
562                 myOffsets[1] = offsetNum;
563                 myOffsets[2] = offsetNum;
564                 if (indexToWrite >= 3) {
565                     myOffsets[3] = offsetNum;
566                 }
567                 myOffsets += (indexToWrite + 1);
568             }
569             else {
570                 /* We might run out of room soon. Write it slowly. */
571                 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
572                     if (myTarget < targetLimit)
573                     {
574                         *(myOffsets++) = offsetNum;
575                         *(myTarget++) = *tempPtr;
576                     }
577                     else
578                     {
579                         cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
580                         *err = U_BUFFER_OVERFLOW_ERROR;
581                     }
582                 }
583             }
584             offsetNum = nextSourceIndex;
585         }
586     }
587 
588     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
589     {
590         *err = U_BUFFER_OVERFLOW_ERROR;
591     }
592 
593     args->target = (char *) myTarget;
594     args->source = mySource;
595     args->offsets = myOffsets;
596 }
597 
ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs * args,UErrorCode * err)598 static UChar32 ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
599                                                UErrorCode *err) {
600     UConverter *cnv;
601     const uint8_t *sourceInitial;
602     const uint8_t *source;
603     uint16_t extraBytesToWrite;
604     uint8_t myByte;
605     UChar32 ch;
606     int8_t i, isLegalSequence;
607 
608     /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */
609 
610     cnv = args->converter;
611     sourceInitial = source = (const uint8_t *)args->source;
612     if (source >= (const uint8_t *)args->sourceLimit)
613     {
614         /* no input */
615         *err = U_INDEX_OUTOFBOUNDS_ERROR;
616         return 0xffff;
617     }
618 
619     myByte = (uint8_t)*(source++);
620     if (myByte < 0x80)
621     {
622         args->source = (const char *)source;
623         return (UChar32)myByte;
624     }
625 
626     extraBytesToWrite = (uint16_t)bytesFromUTF8[myByte];
627     if (extraBytesToWrite == 0) {
628         cnv->toUBytes[0] = myByte;
629         cnv->toULength = 1;
630         *err = U_ILLEGAL_CHAR_FOUND;
631         args->source = (const char *)source;
632         return 0xffff;
633     }
634 
635     /*The byte sequence is longer than the buffer area passed*/
636     if (((const char *)source + extraBytesToWrite - 1) > args->sourceLimit)
637     {
638         /* check if all of the remaining bytes are trail bytes */
639         cnv->toUBytes[0] = myByte;
640         i = 1;
641         *err = U_TRUNCATED_CHAR_FOUND;
642         while(source < (const uint8_t *)args->sourceLimit) {
643             if(U8_IS_TRAIL(myByte = *source)) {
644                 cnv->toUBytes[i++] = myByte;
645                 ++source;
646             } else {
647                 /* error even before we run out of input */
648                 *err = U_ILLEGAL_CHAR_FOUND;
649                 break;
650             }
651         }
652         cnv->toULength = i;
653         args->source = (const char *)source;
654         return 0xffff;
655     }
656 
657     isLegalSequence = 1;
658     ch = myByte << 6;
659     switch(extraBytesToWrite)
660     {
661       /* note: code falls through cases! (sic)*/
662     case 6:
663         ch += (myByte = *source);
664         ch <<= 6;
665         if (!UTF8_IS_TRAIL(myByte))
666         {
667             isLegalSequence = 0;
668             break;
669         }
670         ++source;
671     case 5:
672         ch += (myByte = *source);
673         ch <<= 6;
674         if (!UTF8_IS_TRAIL(myByte))
675         {
676             isLegalSequence = 0;
677             break;
678         }
679         ++source;
680     case 4:
681         ch += (myByte = *source);
682         ch <<= 6;
683         if (!UTF8_IS_TRAIL(myByte))
684         {
685             isLegalSequence = 0;
686             break;
687         }
688         ++source;
689     case 3:
690         ch += (myByte = *source);
691         ch <<= 6;
692         if (!UTF8_IS_TRAIL(myByte))
693         {
694             isLegalSequence = 0;
695             break;
696         }
697         ++source;
698     case 2:
699         ch += (myByte = *source);
700         if (!UTF8_IS_TRAIL(myByte))
701         {
702             isLegalSequence = 0;
703             break;
704         }
705         ++source;
706     };
707     ch -= offsetsFromUTF8[extraBytesToWrite];
708     args->source = (const char *)source;
709 
710     /*
711      * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
712      * - use only trail bytes after a lead byte (checked above)
713      * - use the right number of trail bytes for a given lead byte
714      * - encode a code point <= U+10ffff
715      * - use the fewest possible number of bytes for their code points
716      * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
717      *
718      * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
719      * There are no irregular sequences any more.
720      */
721     if (isLegalSequence &&
722         (uint32_t)ch <= MAXIMUM_UTF &&
723         (uint32_t)ch >= utf8_minChar32[extraBytesToWrite] &&
724         !U_IS_SURROGATE(ch)
725     ) {
726         return ch; /* return the code point */
727     }
728 
729     for(i = 0; sourceInitial < source; ++i) {
730         cnv->toUBytes[i] = *sourceInitial++;
731     }
732     cnv->toULength = i;
733     *err = U_ILLEGAL_CHAR_FOUND;
734     return 0xffff;
735 }
736 
737 /* UTF-8-from-UTF-8 conversion functions ------------------------------------ */
738 
739 /* minimum code point values for n-byte UTF-8 sequences, n=0..4 */
740 static const UChar32
741 utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 };
742 
743 /* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */
744 static const UChar32
745 utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };
746 
747 /* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */
748 static void
ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs * pFromUArgs,UConverterToUnicodeArgs * pToUArgs,UErrorCode * pErrorCode)749 ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
750                   UConverterToUnicodeArgs *pToUArgs,
751                   UErrorCode *pErrorCode) {
752     UConverter *utf8, *cnv;
753     const uint8_t *source, *sourceLimit;
754     uint8_t *target;
755     int32_t targetCapacity;
756     int32_t count;
757 
758     int8_t oldToULength, toULength, toULimit;
759 
760     UChar32 c;
761     uint8_t b, t1, t2;
762 
763     /* set up the local pointers */
764     utf8=pToUArgs->converter;
765     cnv=pFromUArgs->converter;
766     source=(uint8_t *)pToUArgs->source;
767     sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
768     target=(uint8_t *)pFromUArgs->target;
769     targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
770 
771     /* get the converter state from the UTF-8 UConverter */
772     c=(UChar32)utf8->toUnicodeStatus;
773     if(c!=0) {
774         toULength=oldToULength=utf8->toULength;
775         toULimit=(int8_t)utf8->mode;
776     } else {
777         toULength=oldToULength=toULimit=0;
778     }
779 
780     count=(int32_t)(sourceLimit-source)+oldToULength;
781     if(count<toULimit) {
782         /*
783          * Not enough input to complete the partial character.
784          * Jump to moreBytes below - it will not output to target.
785          */
786     } else if(targetCapacity<toULimit) {
787         /*
788          * Not enough target capacity to output the partial character.
789          * Let the standard converter handle this.
790          */
791         *pErrorCode=U_USING_DEFAULT_WARNING;
792         return;
793     } else {
794         /*
795          * Use a single counter for source and target, counting the minimum of
796          * the source length and the target capacity.
797          * As a result, the source length is checked only once per multi-byte
798          * character instead of twice.
799          *
800          * Make sure that the last byte sequence is complete, or else
801          * stop just before it.
802          * (The longest legal byte sequence has 3 trail bytes.)
803          * Count oldToULength (number of source bytes from a previous buffer)
804          * into the source length but reduce the source index by toULimit
805          * while going back over trail bytes in order to not go back into
806          * the bytes that will be read for finishing a partial
807          * sequence from the previous buffer.
808          * Let the standard converter handle edge cases.
809          */
810         int32_t i;
811 
812         if(count>targetCapacity) {
813             count=targetCapacity;
814         }
815 
816         i=0;
817         while(i<3 && i<(count-toULimit)) {
818             b=source[count-oldToULength-i-1];
819             if(U8_IS_TRAIL(b)) {
820                 ++i;
821             } else {
822                 if(i<utf8_countTrailBytes[b]) {
823                     /* stop converting before the lead byte if there are not enough trail bytes for it */
824                     count-=i+1;
825                 }
826                 break;
827             }
828         }
829     }
830 
831     if(c!=0) {
832         utf8->toUnicodeStatus=0;
833         utf8->toULength=0;
834         goto moreBytes;
835         /* See note in ucnv_SBCSFromUTF8() about this goto. */
836     }
837 
838     /* conversion loop */
839     while(count>0) {
840         b=*source++;
841         if((int8_t)b>=0) {
842             /* convert ASCII */
843             *target++=b;
844             --count;
845             continue;
846         } else {
847             if(b>0xe0) {
848                 if( /* handle U+1000..U+D7FF inline */
849                     (t1=source[0]) >= 0x80 && ((b<0xed && (t1 <= 0xbf)) ||
850                                                (b==0xed && (t1 <= 0x9f))) &&
851                     (t2=source[1]) >= 0x80 && t2 <= 0xbf
852                 ) {
853                     source+=2;
854                     *target++=b;
855                     *target++=t1;
856                     *target++=t2;
857                     count-=3;
858                     continue;
859                 }
860             } else if(b<0xe0) {
861                 if( /* handle U+0080..U+07FF inline */
862                     b>=0xc2 &&
863                     (t1=*source) >= 0x80 && t1 <= 0xbf
864                 ) {
865                     ++source;
866                     *target++=b;
867                     *target++=t1;
868                     count-=2;
869                     continue;
870                 }
871             } else if(b==0xe0) {
872                 if( /* handle U+0800..U+0FFF inline */
873                     (t1=source[0]) >= 0xa0 && t1 <= 0xbf &&
874                     (t2=source[1]) >= 0x80 && t2 <= 0xbf
875                 ) {
876                     source+=2;
877                     *target++=b;
878                     *target++=t1;
879                     *target++=t2;
880                     count-=3;
881                     continue;
882                 }
883             }
884 
885             /* handle "complicated" and error cases, and continuing partial characters */
886             oldToULength=0;
887             toULength=1;
888             toULimit=utf8_countTrailBytes[b]+1;
889             c=b;
890 moreBytes:
891             while(toULength<toULimit) {
892                 if(source<sourceLimit) {
893                     b=*source;
894                     if(U8_IS_TRAIL(b)) {
895                         ++source;
896                         ++toULength;
897                         c=(c<<6)+b;
898                     } else {
899                         break; /* sequence too short, stop with toULength<toULimit */
900                     }
901                 } else {
902                     /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
903                     source-=(toULength-oldToULength);
904                     while(oldToULength<toULength) {
905                         utf8->toUBytes[oldToULength++]=*source++;
906                     }
907                     utf8->toUnicodeStatus=c;
908                     utf8->toULength=toULength;
909                     utf8->mode=toULimit;
910                     pToUArgs->source=(char *)source;
911                     pFromUArgs->target=(char *)target;
912                     return;
913                 }
914             }
915 
916             if( toULength==toULimit &&      /* consumed all trail bytes */
917                 (toULength==3 || toULength==2) &&             /* BMP */
918                 (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
919                 (c<=0xd7ff || 0xe000<=c)    /* not a surrogate */
920             ) {
921                 /* legal byte sequence for BMP code point */
922             } else if(
923                 toULength==toULimit && toULength==4 &&
924                 (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
925             ) {
926                 /* legal byte sequence for supplementary code point */
927             } else {
928                 /* error handling: illegal UTF-8 byte sequence */
929                 source-=(toULength-oldToULength);
930                 while(oldToULength<toULength) {
931                     utf8->toUBytes[oldToULength++]=*source++;
932                 }
933                 utf8->toULength=toULength;
934                 pToUArgs->source=(char *)source;
935                 pFromUArgs->target=(char *)target;
936                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
937                 return;
938             }
939 
940             /* copy the legal byte sequence to the target */
941             {
942                 int8_t i;
943 
944                 for(i=0; i<oldToULength; ++i) {
945                     *target++=utf8->toUBytes[i];
946                 }
947                 source-=(toULength-oldToULength);
948                 for(; i<toULength; ++i) {
949                     *target++=*source++;
950                 }
951                 count-=toULength;
952             }
953         }
954     }
955 
956     if(U_SUCCESS(*pErrorCode) && source<sourceLimit) {
957         if(target==(const uint8_t *)pFromUArgs->targetLimit) {
958             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
959         } else {
960             b=*source;
961             toULimit=utf8_countTrailBytes[b]+1;
962             if(toULimit>(sourceLimit-source)) {
963                 /* collect a truncated byte sequence */
964                 toULength=0;
965                 c=b;
966                 for(;;) {
967                     utf8->toUBytes[toULength++]=b;
968                     if(++source==sourceLimit) {
969                         /* partial byte sequence at end of source */
970                         utf8->toUnicodeStatus=c;
971                         utf8->toULength=toULength;
972                         utf8->mode=toULimit;
973                         break;
974                     } else if(!U8_IS_TRAIL(b=*source)) {
975                         /* lead byte in trail byte position */
976                         utf8->toULength=toULength;
977                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
978                         break;
979                     }
980                     c=(c<<6)+b;
981                 }
982             } else {
983                 /* partial-sequence target overflow: fall back to the pivoting implementation */
984                 *pErrorCode=U_USING_DEFAULT_WARNING;
985             }
986         }
987     }
988 
989     /* write back the updated pointers */
990     pToUArgs->source=(char *)source;
991     pFromUArgs->target=(char *)target;
992 }
993 
994 /* UTF-8 converter data ----------------------------------------------------- */
995 
996 static const UConverterImpl _UTF8Impl={
997     UCNV_UTF8,
998 
999     NULL,
1000     NULL,
1001 
1002     NULL,
1003     NULL,
1004     NULL,
1005 
1006     ucnv_toUnicode_UTF8,
1007     ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
1008     ucnv_fromUnicode_UTF8,
1009     ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
1010     ucnv_getNextUChar_UTF8,
1011 
1012     NULL,
1013     NULL,
1014     NULL,
1015     NULL,
1016     ucnv_getNonSurrogateUnicodeSet,
1017 
1018     ucnv_UTF8FromUTF8,
1019     ucnv_UTF8FromUTF8
1020 };
1021 
1022 /* The 1208 CCSID refers to any version of Unicode of UTF-8 */
1023 static const UConverterStaticData _UTF8StaticData={
1024     sizeof(UConverterStaticData),
1025     "UTF-8",
1026     1208, UCNV_IBM, UCNV_UTF8,
1027     1, 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
1028     { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
1029     0,
1030     0,
1031     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1032 };
1033 
1034 
1035 const UConverterSharedData _UTF8Data={
1036     sizeof(UConverterSharedData), ~((uint32_t) 0),
1037     NULL, NULL, &_UTF8StaticData, FALSE, &_UTF8Impl,
1038     0
1039 };
1040 
1041 /* CESU-8 converter data ---------------------------------------------------- */
1042 
1043 static const UConverterImpl _CESU8Impl={
1044     UCNV_CESU8,
1045 
1046     NULL,
1047     NULL,
1048 
1049     NULL,
1050     NULL,
1051     NULL,
1052 
1053     ucnv_toUnicode_UTF8,
1054     ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
1055     ucnv_fromUnicode_UTF8,
1056     ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
1057     NULL,
1058 
1059     NULL,
1060     NULL,
1061     NULL,
1062     NULL,
1063     ucnv_getCompleteUnicodeSet
1064 };
1065 
1066 static const UConverterStaticData _CESU8StaticData={
1067     sizeof(UConverterStaticData),
1068     "CESU-8",
1069     9400, /* CCSID for CESU-8 */
1070     UCNV_UNKNOWN, UCNV_CESU8, 1, 3,
1071     { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
1072     0,
1073     0,
1074     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1075 };
1076 
1077 
1078 const UConverterSharedData _CESU8Data={
1079     sizeof(UConverterSharedData), ~((uint32_t) 0),
1080     NULL, NULL, &_CESU8StaticData, FALSE, &_CESU8Impl,
1081     0
1082 };
1083 
1084 #endif
1085